diff --git a/tests/common.py b/tests/common.py index 25b2f6e1..7b73e3ca 100644 --- a/tests/common.py +++ b/tests/common.py @@ -13,6 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import codecs import contextlib from io import StringIO import os @@ -20,6 +21,8 @@ import sys import tempfile import unittest +import warnings +from codecs import CodecInfo import yaml @@ -27,6 +30,151 @@ from yamllint.config import YamlLintConfig +# Encoding related stuff: +UTF_CODECS = ( + 'utf_32_be', + 'utf_32_be_sig', + 'utf_32_le', + 'utf_32_le_sig', + 'utf_16_be', + 'utf_16_be_sig', + 'utf_16_le', + 'utf_16_le_sig', + 'utf_8', + 'utf_8_sig' +) + + +def encode_utf_32_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors), + len(obj) + ) + + +def encode_utf_32_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors), + len(obj) + ) + + +def encode_utf_16_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors), + len(obj) + ) + + +def encode_utf_16_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors), + len(obj) + ) + + +test_codec_infos = { + 'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_32_le_sig': CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_16_be_sig': CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')), # noqa: E501 + 'utf_16_le_sig': CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')), # noqa: E501 +} + + +def register_test_codecs(): + codecs.register(test_codec_infos.get) + + +def unregister_test_codecs(): + if sys.version_info >= (3, 10, 0): + codecs.unregister(test_codec_infos.get) + else: + warnings.warn( + "This version of Python doesn’t allow us to unregister codecs.", + stacklevel=1 + ) + + +def 
is_test_codec(codec): + return codec in test_codec_infos.keys() + + +def test_codec_built_in_equivalent(test_codec): + return_value = test_codec + for suffix in ('_sig', '_be', '_le'): + return_value = return_value.replace(suffix, '') + return return_value + + +def uses_bom(codec): + for suffix in ('_32', '_16', '_sig'): + if codec.endswith(suffix): + return True + return False + + +def encoding_detectable(string, codec): + """ + Returns True if encoding can be detected after string is encoded + + Encoding detection only works if you’re using a BOM or the first character + is ASCII. See yamllint.decoder.auto_decode()’s docstring. + """ + return uses_bom(codec) or (len(string) > 0 and string[0].isascii()) + + +# Workspace related stuff: +class Blob: + def __init__(self, text, encoding): + self.text = text + self.encoding = encoding + + +def build_temp_workspace(files): + tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') + + for path, content in files.items(): + path = os.fsencode(os.path.join(tempdir, path)) + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + + if isinstance(content, list): + os.mkdir(path) + elif isinstance(content, str) and content.startswith('symlink://'): + os.symlink(content[10:], path) + else: + if isinstance(content, Blob): + content = content.text.encode(content.encoding) + mode = 'wb' if isinstance(content, bytes) else 'w' + with open(path, mode) as f: + f.write(content) + + return tempdir + + +@contextlib.contextmanager +def temp_workspace(files): + """Provide a temporary workspace that is automatically cleaned up.""" + backup_wd = os.getcwd() + wd = build_temp_workspace(files) + + try: + os.chdir(wd) + yield + finally: + os.chdir(backup_wd) + shutil.rmtree(wd) + + +def temp_workspace_with_files_in_many_codecs(path_template, text): + workspace = {} + for codec in UTF_CODECS: + if encoding_detectable(text, codec): + workspace[path_template.format(codec)] = Blob(text, codec) + return workspace + + +# 
Miscellaneous stuff: class RuleTestCase(unittest.TestCase): def build_fake_config(self, conf): if conf is None: @@ -81,37 +229,3 @@ def __exit__(self, *exc_info): @property def returncode(self): return self._raises_ctx.exception.code - - -def build_temp_workspace(files): - tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') - - for path, content in files.items(): - path = os.fsencode(os.path.join(tempdir, path)) - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - if isinstance(content, list): - os.mkdir(path) - elif isinstance(content, str) and content.startswith('symlink://'): - os.symlink(content[10:], path) - else: - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: - f.write(content) - - return tempdir - - -@contextlib.contextmanager -def temp_workspace(files): - """Provide a temporary workspace that is automatically cleaned up.""" - backup_wd = os.getcwd() - wd = build_temp_workspace(files) - - try: - os.chdir(wd) - yield - finally: - os.chdir(backup_wd) - shutil.rmtree(wd) diff --git a/tests/test_cli.py b/tests/test_cli.py index 765d7083..51825efc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,7 +23,14 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext, temp_workspace +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + temp_workspace, + unregister_test_codecs, + temp_workspace_with_files_in_many_codecs, +) from yamllint import cli, config @@ -819,3 +826,52 @@ def test_multiple_parent_config_file(self): self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, './4spaces.yml:2:5: [warning] wrong indentation: ' 'expected 3 but found 4 (indentation)\n', '')) + + +class CommandLineEncodingTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + register_test_codecs() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + 
unregister_test_codecs() + + def test_valid_encodings(self): + conf = ('---\n' + 'rules:\n' + ' key-ordering: enable\n') + config_files = temp_workspace_with_files_in_many_codecs( + 'config_{}.yaml', + conf + ) + sorted_correctly = ('---\n' + 'A: YAML\n' + 'Z: YAML\n') + sorted_correctly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_correctly/{}.yaml', + sorted_correctly + ) + sorted_incorrectly = ('---\n' + 'Z: YAML\n' + 'A: YAML\n') + sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_incorrectly/{}.yaml', + sorted_incorrectly + ) + workspace = { + **config_files, + **sorted_correctly_files, + **sorted_incorrectly_files + } + + with temp_workspace(workspace): + for config_path in config_files.keys(): + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_correctly/')) + self.assertEqual(ctx.returncode, 0) + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_incorrectly/')) + self.assertNotEqual(ctx.returncode, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py new file mode 100644 index 00000000..f7ef1650 --- /dev/null +++ b/tests/test_decoder.py @@ -0,0 +1,452 @@ +# Copyright (C) 2023–2024 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import codecs +import unittest + +from tests.common import ( + UTF_CODECS, + encoding_detectable, + is_test_codec, + register_test_codecs, + test_codec_built_in_equivalent, + unregister_test_codecs, + uses_bom, +) + +from yamllint import decoder + + +class PreEncodedTestStringInfo(): + def __init__( + self, + input_bytes, + codec_for_input_bytes, + expected_output_str + ): + self.input_bytes = input_bytes + self.codec_for_input_bytes = codec_for_input_bytes + self.expected_output_str = expected_output_str + + +PRE_ENCODED_TEST_STRING_INFOS = ( + # An empty string + PreEncodedTestStringInfo( + b'', + None, + '' + ), + + # A single ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00|', + 'utf_32_be', + '|' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00|', + 'utf_32', + '|' + ), + PreEncodedTestStringInfo( + b'|\x00\x00\x00', + 'utf_32_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00|\x00\x00\x00', + 'utf_32', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'\x00|', + 'utf_16_be', + '|' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00|', + 'utf_16', # BE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|\x00', + 'utf_16_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe|\x00', + 'utf_16', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|', + 'utf_8', + '|' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf|', + 'utf_8_sig', + '|' + ), + + # A string that starts with an ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'What’s up?' 
+ ), + PreEncodedTestStringInfo( + b'W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfeW\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'What\xe2\x80\x99s up?', + 'utf_8', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbfWhat\xe2\x80\x99s up?', + 'utf_8_sig', + 'What’s up?' + ), + + # A single non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x01\xf4;', + 'utf_32', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00;\xf4\x01\x00', + 'utf_32', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\xd8=\xdc;', + 'utf_16', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe=\xd8;\xdc', + 'utf_16', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xf0\x9f\x90\xbb', + 'utf_8_sig', + '🐻' + ), + + # A string that starts with a non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'Ça va?' 
+ ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00\xc7\x00a\x00 \x00v\x00a\x00?', + 'utf_16', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\xc7\x00a\x00 \x00v\x00a\x00?\x00', + 'utf_16', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xc3\x87a va?', + 'utf_8_sig', + 'Ça va?' + ) +) +TEST_STRINGS_TO_ENCODE_AT_RUNTIME = ( + "", + "y", + "yaml", + "🇾⁠🇦⁠🇲⁠🇱⁠❗" +) +setUpModule = register_test_codecs +tearDownModule = unregister_test_codecs + + +class EncodingStuffFromCommonTestCase(unittest.TestCase): + def test_test_codecs_and_utf_codecs(self): + error = "{} failed to correctly encode then decode {}." + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + self.assertEqual( + string, + string.encode(codec).decode(codec), + msg=error.format(repr(codec), repr(string)) + ) + + def test_is_test_codec(self): + self.assertFalse(is_test_codec('utf_32')) + self.assertFalse(is_test_codec('utf_32_be')) + self.assertTrue(is_test_codec('utf_32_be_sig')) + self.assertFalse(is_test_codec('utf_32_le')) + self.assertTrue(is_test_codec('utf_32_le_sig')) + + self.assertFalse(is_test_codec('utf_16')) + self.assertFalse(is_test_codec('utf_16_be')) + self.assertTrue(is_test_codec('utf_16_be_sig')) + self.assertFalse(is_test_codec('utf_16_le')) + self.assertTrue(is_test_codec('utf_16_le_sig')) + + self.assertFalse(is_test_codec('utf_8')) + self.assertFalse(is_test_codec('utf_8_be')) + + def test_test_codec_built_in_equivalent(self): + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_be_sig') + ) + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_le_sig') + ) + + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_be_sig') + ) + self.assertEqual( + 'utf_16', + 
test_codec_built_in_equivalent('utf_16_le_sig') + ) + + def test_uses_bom(self): + self.assertTrue(uses_bom('utf_32')) + self.assertFalse(uses_bom('utf_32_be')) + self.assertTrue(uses_bom('utf_32_be_sig')) + self.assertFalse(uses_bom('utf_32_le')) + self.assertTrue(uses_bom('utf_32_le_sig')) + + self.assertTrue(uses_bom('utf_16')) + self.assertFalse(uses_bom('utf_16_be')) + self.assertTrue(uses_bom('utf_16_be_sig')) + self.assertFalse(uses_bom('utf_16_le')) + self.assertTrue(uses_bom('utf_16_le_sig')) + + self.assertFalse(uses_bom('utf_8')) + self.assertTrue(uses_bom('utf_8_sig')) + + def test_encoding_detectable(self): + # No BOM + nothing + self.assertFalse(encoding_detectable('', 'utf_32_be')) + self.assertFalse(encoding_detectable('', 'utf_32_le')) + + self.assertFalse(encoding_detectable('', 'utf_16_be')) + self.assertFalse(encoding_detectable('', 'utf_16_le')) + + self.assertFalse(encoding_detectable('', 'utf_8')) + # BOM + nothing + self.assertTrue(encoding_detectable('', 'utf_32')) + self.assertTrue(encoding_detectable('', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_16')) + self.assertTrue(encoding_detectable('', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_8_sig')) + # No BOM + non-ASCII + self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be')) + self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le')) + + self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be')) + self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le')) + + self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8')) + # No BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32_be')) + self.assertTrue(encoding_detectable('gi', 'utf_32_le')) + + self.assertTrue(encoding_detectable('ve', 'utf_16_be')) + self.assertTrue(encoding_detectable(' y', 'utf_16_le')) + + self.assertTrue(encoding_detectable('ou', 'utf_8')) + # BOM + 
non-ASCII + self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32')) + self.assertTrue(encoding_detectable('ⓟ␤', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16')) + self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig')) + # BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32')) + self.assertTrue(encoding_detectable('le', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('yo', 'utf_16')) + self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('do', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('wn', 'utf_8_sig')) + + +class DecoderTestCase(unittest.TestCase): + def detect_encoding_test_helper( + self, + original_string, + input_bytes, + expected_output + ): + ERROR1 = "{} was encoded with {}, but detect_encoding() returned {}." + ERROR2 = "detect_encoding({}) returned a codec that isn’t built-in." 
+ actual_output = decoder.detect_encoding(input_bytes) + if expected_output is not None: + self.assertEqual( + expected_output, + actual_output, + msg=ERROR1.format( + input_bytes, + repr(expected_output), + repr(actual_output) + ) + ) + + codec_info = codecs.lookup(actual_output) + self.assertFalse( + is_test_codec(codec_info), + msg=ERROR2.format(input_bytes) + ) + + def test_detect_encoding_with_pre_encoded_strings(self): + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + self.detect_encoding_test_helper( + pre_encoded_test_string_info.expected_output_str, + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes + ) + + def test_detect_encoding_with_strings_encoded_at_runtime(self): + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + if not uses_bom(codec) and len(string) == 0: + expected_output = 'utf_8' + elif not encoding_detectable(string, codec): + expected_output = None + elif is_test_codec(codec): + expected_output = test_codec_built_in_equivalent(codec) + else: + expected_output = codec + self.detect_encoding_test_helper( + string, + string.encode(codec), + expected_output + ) + + def auto_decode_test_helper( + self, + input_bytes, + codec_for_input_bytes, + expected_output + ): + ERROR = "auto_decode({}) returned the wrong value." 
+ does_auto_detect_encodings_return_value_matter = ( + codec_for_input_bytes is not None and ( + encoding_detectable(expected_output, codec_for_input_bytes) + or len(input_bytes) == 0 + ) + ) + if does_auto_detect_encodings_return_value_matter: + actual_output = decoder.auto_decode(input_bytes) + self.assertEqual( + expected_output, + actual_output, + msg=ERROR.format(repr(input_bytes)) + ) + self.assertIsInstance(actual_output, str) + else: + try: + decoder.auto_decode(input_bytes) + except UnicodeDecodeError as exception: + return exception + return None + + def test_auto_decode_with_pre_encoded_strings(self): + ERROR = "auto_decode({}) should not have raised an exception" + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + exception = self.auto_decode_test_helper( + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes, + pre_encoded_test_string_info.expected_output_str + ) + if exception is not None: + new_exception = self.failureException( + msg=ERROR.format( + repr(pre_encoded_test_string_info.input_bytes) + ) + ) + raise new_exception from exception + + def test_auto_decode_with_strings_encoded_at_runtime(self): + at_least_one_decode_error = False + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + exception = self.auto_decode_test_helper( + string.encode(codec), + codec, + string + ) + if exception is not None: + at_least_one_decode_error = True + self.assertTrue( + at_least_one_decode_error, + msg=( + "None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a " + + "decoding error." 
+ ) + ) diff --git a/yamllint/cli.py b/yamllint/cli.py index 9a39bd8c..7059b852 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -219,7 +219,7 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with open(file, newline='') as f: + with open(file, mode='rb') as f: problems = linter.run(f, conf, filepath) except OSError as e: print(e, file=sys.stderr) diff --git a/yamllint/config.py b/yamllint/config.py index 9ce62549..c40d8205 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -20,6 +20,7 @@ import yaml import yamllint.rules +from yamllint import decoder class YamlLintConfigError(Exception): @@ -38,8 +39,8 @@ def __init__(self, content=None, file=None): self.locale = None if file is not None: - with open(file) as f: - content = f.read() + with open(file, mode='rb') as f: + content = decoder.auto_decode(f.read()) self.parse(content) self.validate() diff --git a/yamllint/decoder.py b/yamllint/decoder.py new file mode 100644 index 00000000..1e3c2f32 --- /dev/null +++ b/yamllint/decoder.py @@ -0,0 +1,65 @@ +# Copyright (C) 2023 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import codecs + + +def detect_encoding(stream_data): + """ + Return stream_data’s character encoding + + Specifically, this function will take a bytes object and return a string + that contains the name of one of Python’s built-in codecs [1]. + + The YAML spec says that streams must begin with a BOM or an ASCII + character. If stream_data doesn’t begin with either of those, then this + function might return the wrong encoding. See chapter 5.2 of the YAML spec + for details [2]. + + [1]: <https://docs.python.org/3/library/codecs.html#standard-encodings> + [2]: <https://yaml.org/spec/1.2.2/#52-character-encodings> + """ + if stream_data.startswith(codecs.BOM_UTF32_BE): + return 'utf_32' + elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4: + return 'utf_32_be' + elif stream_data.startswith(codecs.BOM_UTF32_LE): + return 'utf_32' + elif stream_data[1:4] == b'\x00\x00\x00': + return 'utf_32_le' + elif stream_data.startswith(codecs.BOM_UTF16_BE): + return 'utf_16' + elif stream_data.startswith(b'\x00') and len(stream_data) >= 2: + return 'utf_16_be' + elif stream_data.startswith(codecs.BOM_UTF16_LE): + return 'utf_16' + elif stream_data[1:2] == b'\x00': + return 'utf_16_le' + elif stream_data.startswith(codecs.BOM_UTF8): + return 'utf_8_sig' + else: + return 'utf_8' + + +def auto_decode(stream_data): + return stream_data.decode(encoding=detect_encoding(stream_data)) + + +def lines_in_files(paths): + """Autodecodes files and yields their lines.""" + for path in paths: + with open(path, 'rb') as file: + text = auto_decode(file.read()) + yield from text.splitlines() diff --git a/yamllint/linter.py b/yamllint/linter.py index a2faa061..2230a600 100644 --- a/yamllint/linter.py +++ b/yamllint/linter.py @@ -18,7 +18,7 @@ import yaml -from yamllint import parser +from yamllint import decoder, parser PROBLEM_LEVELS = { 0: None, @@ -187,6 +187,8 @@ def get_syntax_error(buffer): def _run(buffer, conf, filepath): assert hasattr(buffer, '__getitem__'), \ '_run() argument must be a buffer, not a stream' + if isinstance(buffer, bytes): + buffer = decoder.auto_decode(buffer)
first_line = next(parser.line_generator(buffer)).content if re.match(r'^#\s*yamllint disable-file\s*$', first_line):