diff --git a/Makefile b/Makefile
index 179f23f5d45..a1ee6f14bb7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,18 @@
 .PHONY: quality style test

-check_dirs := tests src benchmarks metrics
+check_dirs := tests src benchmarks metrics utils

 # Check that source code meets quality standards

 quality:
-	black --check $(check_dirs)
-	ruff $(check_dirs)
+	black --check $(check_dirs) setup.py
+	ruff $(check_dirs) setup.py

 # Format source code automatically

 style:
-	black tests src benchmarks metrics
-	ruff $(check_dirs) --fix
+	black tests src benchmarks metrics setup.py
+	ruff $(check_dirs) setup.py --fix

 # Run tests for the library
diff --git a/README.md b/README.md
index 970ad5154c1..6f17839e937 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@
 [🎓 **Documentation**](https://huggingface.co/docs/datasets/) [🕹 **Colab tutorial**](https://colab.research.google.com/github/huggingface/datasets/blob/main/notebooks/Overview.ipynb)
-[🔎 **Find a dataset in the Hub**](https://huggingface.co/datasets) [🌟 **Add a new dataset to the Hub**](https://huggingface.co/docs/datasets/share.html)
+[🔎 **Find a dataset in the Hub**](https://huggingface.co/datasets) [🌟 **Share a dataset on the Hub**](https://huggingface.co/docs/datasets/share)

@@ -155,9 +155,7 @@ If you are familiar with the great TensorFlow Datasets, here are the main differ

 # Disclaimers

-Similar to TensorFlow Datasets, 🤗 Datasets is a utility library that downloads and prepares public datasets. We do not host or distribute most of these datasets, vouch for their quality or fairness, or claim that you have license to use them. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license.
-
-Moreover 🤗 Datasets may run Python code defined by the dataset authors to parse certain data formats or structures. For security reasons, we ask users to:
+🤗 Datasets may run Python code defined by the dataset authors to parse certain data formats or structures. For security reasons, we ask users to:

 - check the dataset scripts they're going to run beforehand and
 - pin the `revision` of the repositories they use.
diff --git a/convert_dataset.sh b/convert_dataset.sh
deleted file mode 100755
index b1cfa744f14..00000000000
--- a/convert_dataset.sh
+++ /dev/null
@@ -1,185 +0,0 @@
-#!/usr/bin/env bash
-
-pathToFile=${1}
-manual_dir=${2}
-
-curPath=$(pwd)
-
-if [ ! -f "${pathToFile}" ]; then
-    echo "${pathToFile} does not exist"
-    exit
-fi
-
-tfdsFolder=$(python -c "print('/'.join(\"${pathToFile}\".split('/')[:-1]))")
-datasetName=$(python -c "print(\"${pathToFile}\".split('/')[-1].split('.')[0])")
-
-# Step 0
-
-# Uncomment if you want to clean your cache
-#echo "### STEP 0 ### Clean your cache..."
-#rm -rf "${curPath}/src/datasets/datasets/*"
-#rm -rf "~/.cache/huggingface/datasets/*"
-
-# Step 1
-
-pathToFolder="datasets/${datasetName}"
-
-echo ""
-echo ""
-if [ -f "${pathToFolder}/${datasetName}.py" ]; then
-    echo "### STEP 1 ### ${datasetName} is already converted. To convert it again remove ${pathToFolder}/${datasetName}."
-else
-    echo "### STEP 1 ### Converting ${datasetName} dataset ..."
-    eval "datasets-cli convert --tfds_path ${pathToFile} --datasets_directory datasets/"
-fi
-
-if [ -f "${pathToFolder}/${datasetName}.py" ]; then
-    echo "${datasetName}.py found in ${pathToFolder}"
-else
-    echo "${pathToFolder} must have a ${datasetName}.py, but was not found. Conversion error. Check conversion manually."
-    exit
-fi
-
-echo "Conversion successful!"
-
-# STEP 2
-
-echo ""
-echo ""
-if [ -f "${pathToFolder}/dataset_infos.json" ]; then
-    echo "### STEP 2 ### Dataset infos file is already created. To create it again remove ${pathToFolder}/dataset_infos.json ..."
-else
-    echo "### STEP 2 ### Create infos ..."
-    if [ -z "${manual_dir}" ]; then
-        eval "datasets-cli test ${pathToFolder} --save_infos --all_configs"
-    else
-        eval "datasets-cli test ${pathToFolder} --data_dir ${manual_dir} --save_infos --all_configs"
-    fi
-fi
-
-if [ -f "${pathToFolder}/dataset_infos.json" ]; then
-    echo "dataset_infos.json found in ${pathToFolder}."
-else
-    echo "dataset_infos.json not found in ${pathToFolder}. Add dataset infos manually."
-    exit
-fi
-
-# rm lock file
-rm ${pathToFolder}/*.lock
-
-echo "Dataset infos creation succesful!"
-
-echo ""
-echo ""
-echo "### STEP 3 ### Make style ..."
-eval "make style"
-
-echo ""
-echo ""
-
-cd ${pathToFolder}
-name=${datasetName}
-builderName=$(python -c "import stringcase; print(stringcase.pascalcase(\"${name}\"));")
-
-configNames=$(python -c "from ${name} import ${builderName}; [print(x.name) for x in ${builderName}.BUILDER_CONFIGS];")
-
-versions=$(python -c "from ${name} import ${builderName}; [print(str(x.version.major) + '.' + str(x.version.minor) + '.' + str(x.version.patch)) for x in ${builderName}.BUILDER_CONFIGS];")
-
-mainVersion=$(python -c "from ${name} import ${builderName}; print(str(${builderName}.VERSION.major) + '.' + str(${builderName}.VERSION.minor) + '.' + str(${builderName}.VERSION.patch));")
-
-if [ ! -z "${versions}" ]; then
-    versionArray=(`echo $versions`)
-else
-    versionArray=(`echo $mainVersion`)
-fi
-
-for version in "${versionArray[@]}"; do
-    echo "Found version name ${version}"
-    firstVersion=${versionArray[0]}
-done
-
-configArray=(`echo $configNames`)
-for config in "${configArray[@]}"; do
-    echo "Found config name ${config}"
-    firstConfig=${configArray[0]}
-done
-
-if [ -d "./dummy" ]; then
-    echo "### STEP 4 & 5 ### dummy folder is already created. To rerun the command, delete ${pathToFolder}/dummy"
-    cd ${curPath}
-else
-    echo "### STEP 4 ### Create dummy folder structure..."
-
-    if [ -z "${configNames}" ]; then
-        echo "${datasetName} has no configs. Create dummy data without config folder ... "
-
-        mkdir -p ${curPath}/${pathToFolder}/dummy/${firstVersion}/
-        echo "Created ${curPath}/${pathToFolder}/dummy/${firstVersion} ..."
-    else
-        echo "${datasetName} has config. Create dummy data with config folder ... "
-        for ((i=0;i<${#configArray[@]};++i)); do
-            config=${configArray[i]}
-            version=${versionArray[i]}
-            mkdir -p ${curPath}/${pathToFolder}/dummy/${config}/${version}/
-            echo "Created ${curPath}/${pathToFolder}/dummy/${config}/${version} ..."
-        done
-    fi
-
-    cd ${curPath}
-
-    echo ""
-    echo ""
-    echo "### STEP 5 ### Create dummy data from ${fakeDataFolder}"
-
-    echo "${tfdsFolder}"
-    fakeDataFolder=$(readlink -m ${tfdsFolder}/../testing/test_data/fake_examples/${datasetName})
-
-    if [ -d "${fakeDataFolder}" ]; then
-        echo "fake data folder found in ${fakeDataFolder}"
-    else
-        echo "fake data folder not found. ${fakeDataFolder} does not exist. Create dummy data manually."
-        exit
-    fi
-
-    echo "Zipping and copying data from ${fakeDataFolder}..."
-    cd "${fakeDataFolder}"
-    dirFilesAndFolders=$(ls)
-    mkdir dummy_data
-    for dir in "${dirFilesAndFolders}"; do
-        echo "Adding ${dir} to dummy_data zip dir"
-        cp -r ${dir} dummy_data/
-    done
-    eval "zip -r dummy_data.zip dummy_data"
-    rm -r dummy_data
-
-    # Copy zipped data to correct file
-    if [ -z "${configNames}" ]; then
-        eval "mv dummy_data.zip ${curPath}/${pathToFolder}/dummy/${version}/dummy_data.zip"
-    else
-        if [ "${#configArray[@]}" -gt 1 ]; then
-            echo "Dataset has multiple configs. Copy zip data to first config: ${firstConfig}..."
-            echo "IMPORTANT: Fix zipped dummy data manually!"
-            eval "mv dummy_data.zip ${curPath}/${pathToFolder}/dummy/${firstConfig}/${version}/dummy_data.zip"
-        else
-            echo "Copy zip data to first config: ${firstConfig}..."
-            eval "mv dummy_data.zip ${curPath}/${pathToFolder}/dummy/${firstConfig}/${version}/dummy_data.zip"
-        fi
-    fi
-    cd "${curPath}"
fi
-
-# rm pycache
-rm -rf ${pathToFolder}/__pycache__
-
-if [ -f ${curPath}/${pathToFolder}/dummy/${firstVersion}/dummy_data.zip ] || [ -f ${curPath}/${pathToFolder}/dummy/${firstConfig}/${firstVersion}/dummy_data.zip ] ; then
-    echo ""
-    echo ""
-    echo "Conversion succesful!"
-    echo ""
-    echo ""
-    echo "Check that the following two commands work:"
-    echo "RUN_SLOW=1 pytest tests/test_dataset_common.py::DatasetTest::test_load_real_dataset_local_${datasetName}"
-    echo "RUN_SLOW=1 pytest tests/test_dataset_common.py::DatasetTest::test_load_dataset_all_configs_local_${datasetName}"
-    echo "pytest tests/test_dataset_common.py::DatasetTest::test_load_dataset_local_${datasetName}"
-fi
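For context, the deleted script shells out to the third-party `stringcase` package to turn a snake_case dataset name into the PascalCase builder class it then imports for `BUILDER_CONFIGS`. A minimal sketch of that step in plain Python, assuming `stringcase` is installed (the dataset name is an illustrative placeholder):

```python
import stringcase  # third-party package the deleted script relied on

# Equivalent of: python -c "import stringcase; print(stringcase.pascalcase(\"${name}\"))"
name = "squad_v2"  # hypothetical dataset name
builder_name = stringcase.pascalcase(name)
print(builder_name)  # prints "SquadV2", the builder class name looked up for BUILDER_CONFIGS
```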
- echo "" - echo "" - echo "Check that the following two commands work:" - echo "RUN_SLOW=1 pytest tests/test_dataset_common.py::DatasetTest::test_load_real_dataset_local_${datasetName}" -echo "RUN_SLOW=1 pytest tests/test_dataset_common.py::DatasetTest::test_load_dataset_all_configs_local_${datasetName}" -echo "pytest tests/test_dataset_common.py::DatasetTest::test_load_dataset_local_${datasetName}" -fi diff --git a/setup.py b/setup.py index 8fb1be73db4..2a0e3d746ff 100644 --- a/setup.py +++ b/setup.py @@ -253,7 +253,7 @@ package_dir={"": "src"}, packages=find_packages("src"), package_data={ - "datasets": ["py.typed", "scripts/templates/*"], + "datasets": ["py.typed"], "datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"], }, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, diff --git a/tests/test_dataset_scripts.py b/tests/test_dataset_scripts.py deleted file mode 100644 index fcb06f76517..00000000000 --- a/tests/test_dataset_scripts.py +++ /dev/null @@ -1,67 +0,0 @@ -import re -from pathlib import Path -from unittest import TestCase - -import pytest - - -@pytest.mark.integration -class TestDatasetScripts(TestCase): - def _no_encoding_on_file_open(self, filepath: str): - r"""Find all instances where a non-binary file is opened without UTF-8 encoding. - - This function uses regular expressions to find instances where Python's `open()` function is used to open - non-binary files. See below for an explanation of the regular expression: - - (?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b): Lookahead and discard match if `encoding` or `rb` etc are - arguments of `open()`. - - (?<=\s): Lookbehind and match if `open()` predeceded by one whitespace. - - (open)\((.*)\): Capture everything in parentheses of `open()`. - """ - - with open(filepath, encoding="utf-8") as input_file: - regexp = re.compile(r"(?!.*\b(?:encoding|rb|w|wb|w+|wb+|ab|ab+)\b)(?<=\s)(open)\((.*)\)") - input_text = input_file.read() - match = regexp.search(input_text) - - return match - - def _no_print_statements(self, filepath: str): - r"""Find all instances where a python sctipt file contains a `print` statement. - - #[^\r\n]*print\(: Match print statement inside a comment. We ignore this group. - - \"[^\r\n]*print\(: Match print statement inside a string. We ignore this group. - - \"\"\".*?print\(.*?\"\"\"": Match print statement inside a triple-quoted string. Uses re.DOTALL to also match newlines with ".". - We ignore this group. - - (print\()): Match print statement. - """ - - with open(filepath, encoding="utf-8") as input_file: - regexp = re.compile(r"#[^\r\n]*print\(|\"[^\r\n]*print\(|\"\"\".*?print\(.*?\"\"\"|(print\()", re.DOTALL) - input_text = input_file.read() - # use `re.finditer` to handle the case where the ignored groups would be matched first by `re.search` - matches = regexp.finditer(input_text) - - matches = [match for match in matches if match is not None and match.group(1) is not None] - return matches[0] if matches else None - - def test_no_encoding_on_file_open(self): - dataset_paths = Path("./datasets") - dataset_files = list(dataset_paths.absolute().glob("**/*.py")) - - for dataset in dataset_files: - if self._no_encoding_on_file_open(str(dataset)): - raise AssertionError(f"open(...) 
diff --git a/utils/release.py b/utils/release.py
index 6036a49f12c..04a0cf02793 100644
--- a/utils/release.py
+++ b/utils/release.py
@@ -13,38 +13,92 @@
 # limitations under the License.

 import argparse
+import re
+
+import packaging.version

-CUSTOM_JS_FILE = "docs/source/_static/js/custom.js"
+REPLACE_PATTERNS = {
+    "init": (re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE), '__version__ = "VERSION"\n'),
+    "setup": (re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE), r'\1version="VERSION",'),
+}
+REPLACE_FILES = {
+    "init": "src/datasets/__init__.py",
+    "setup": "setup.py",
+}

-def update_custom_js(version):
-    """Update the version table in the custom.js file."""
-    with open(CUSTOM_JS_FILE, encoding="utf-8", newline="\n") as f:
-        lines = f.readlines()
-    index = 0
-    # First let's put the right version
-    while not lines[index].startswith("const stableVersion ="):
-        index += 1
-    lines[index] = f'const stableVersion = "v{version}"\n'
+def update_version_in_file(fname, version, pattern):
+    """Update the version in one file using a specific pattern."""
+    with open(fname, "r", encoding="utf-8", newline="\n") as f:
+        code = f.read()
+    re_pattern, replace = REPLACE_PATTERNS[pattern]
+    replace = replace.replace("VERSION", version)
+    code = re_pattern.sub(replace, code)
+    with open(fname, "w", encoding="utf-8", newline="\n") as f:
+        f.write(code)

-    # Then update the dictionary
-    while not lines[index].startswith("const versionMapping = {"):
-        index += 1
-    # We go until the end
-    while not lines[index].startswith("}"):
-        index += 1
-    # We add the new version at the end
-    lines[index - 1] += f'    "v{version}": "v{version}",\n'
+def global_version_update(version):
+    """Update the version in all needed files."""
+    for pattern, fname in REPLACE_FILES.items():
+        update_version_in_file(fname, version, pattern)

-    with open(CUSTOM_JS_FILE, "w", encoding="utf-8", newline="\n") as f:
-        f.writelines(lines)
+
+def get_version():
+    """Reads the current version in the __init__."""
+    with open(REPLACE_FILES["init"], "r") as f:
+        code = f.read()
+    default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0]
+    return packaging.version.parse(default_version)
+
+
+def pre_release_work(patch=False):
+    """Do all the necessary pre-release steps."""
+    # First let's get the default version: base version if we are in dev, bump minor otherwise.
+    default_version = get_version()
+    if patch and default_version.is_devrelease:
+        raise ValueError("Can't create a patch version from the dev branch, checkout a released version!")
+    if default_version.is_devrelease:
+        default_version = default_version.base_version
+    elif patch:
+        default_version = f"{default_version.major}.{default_version.minor}.{default_version.micro + 1}"
+    else:
+        default_version = f"{default_version.major}.{default_version.minor + 1}.0"
+
+    # Now let's ask nicely if that's the right one.
+    version = input(f"Which version are you releasing? [{default_version}]")
[{default_version}]") + if len(version) == 0: + version = default_version + + print(f"Updating version to {version}.") + global_version_update(version) + + +def post_release_work(): + """Do all the necesarry post-release steps.""" + # First let's get the current version + current_version = get_version() + dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0" + current_version = current_version.base_version + + # Check with the user we got that right. + version = input(f"Which version are we developing now? [{dev_version}]") + if len(version) == 0: + version = dev_version + + print(f"Updating version to {version}.") + global_version_update(version) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--version", help="Release version.") + parser.add_argument("--post_release", action="store_true", help="Whether or not this is post release.") + parser.add_argument("--patch", action="store_true", help="Whether or not this is a patch release.") args = parser.parse_args() - update_custom_js(args.version) + if not args.post_release: + pre_release_work(patch=args.patch) + elif args.patch: + print("Nothing to do after a patch :-)") + else: + post_release_work()