From e5ef0391c0dd9de41ac69f2990821d6b3221a732 Mon Sep 17 00:00:00 2001
From: Jason Yundt
Date: Sat, 30 Dec 2023 12:51:24 -0500
Subject: [PATCH] decoder: Autodetect encoding of YAML files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this change, yamllint would open YAML files using open()’s
default encoding. As long as UTF-8 mode isn’t enabled, open() defaults
to using the system’s locale encoding [1][2]. This can cause problems
in several scenarios.

The first scenario involves linting UTF-8 YAML files on Linux systems.
Most of the time, the locale encoding on Linux systems is set to UTF-8
[3][4], but it can be set to something else [5]. In the unlikely event
that someone was using Linux with a locale encoding other than UTF-8,
there was a chance that yamllint would crash with a UnicodeDecodeError.

The second scenario involves linting UTF-8 YAML files on Windows
systems. The locale encoding on Windows systems is the system’s ANSI
code page [6], which is NOT set to UTF-8 by default [7]. In the very
likely event that someone was using Windows with a locale encoding
other than UTF-8, there was a chance that yamllint would crash with a
UnicodeDecodeError.

Additionally, using open()’s default encoding is a violation of the
YAML spec. Chapter 5.2 says: “On input, a YAML processor must support
the UTF-8 and UTF-16 character encodings. For JSON compatibility, the
UTF-32 encodings must also be supported. If a character stream begins
with a byte order mark, the character encoding will be taken to be as
indicated by the byte order mark. Otherwise, the stream must begin with
an ASCII character. This allows the encoding to be deduced by the
pattern of null (x00) characters.” [8]

This change fixes all of those problems by implementing the YAML spec’s
character encoding detection algorithm. Now, as long as YAML files
begin with either a byte order mark or an ASCII character, yamllint
will automatically detect them as being UTF-8, UTF-16 or UTF-32. Other
character encodings are not supported at the moment.

Credit for the idea of having tests with pre-encoded strings goes to
@adrienverge [9].

Fixes #218. Fixes #238. Fixes #347.

[1]:
[2]:
[3]:
[4]:
[5]:
[6]:
[7]:
[8]:
[9]:
---
 tests/common.py       | 182 +++++++++++++----
 tests/test_cli.py     |  58 +++++-
 tests/test_decoder.py | 452 ++++++++++++++++++++++++++++++++++++++++++
 yamllint/cli.py       |   2 +-
 yamllint/config.py    |   5 +-
 yamllint/decoder.py   |  65 ++++++
 yamllint/linter.py    |   4 +-
 7 files changed, 729 insertions(+), 39 deletions(-)
 create mode 100644 tests/test_decoder.py
 create mode 100644 yamllint/decoder.py

diff --git a/tests/common.py b/tests/common.py
index 25b2f6e1..7b73e3ca 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -13,6 +13,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see .
 
+import codecs import contextlib from io import StringIO import os @@ -20,6 +21,8 @@ import sys import tempfile import unittest +import warnings +from codecs import CodecInfo import yaml @@ -27,6 +30,151 @@ from yamllint.config import YamlLintConfig +# Encoding related stuff: +UTF_CODECS = ( + 'utf_32_be', + 'utf_32_be_sig', + 'utf_32_le', + 'utf_32_le_sig', + 'utf_16_be', + 'utf_16_be_sig', + 'utf_16_le', + 'utf_16_le_sig', + 'utf_8', + 'utf_8_sig' +) + + +def encode_utf_32_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors), + len(obj) + ) + + +def encode_utf_32_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors), + len(obj) + ) + + +def encode_utf_16_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors), + len(obj) + ) + + +def encode_utf_16_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors), + len(obj) + ) + + +test_codec_infos = { + 'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_32_le_sig': CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_16_be_sig': CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')), # noqa: E501 + 'utf_16_le_sig': CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')), # noqa: E501 +} + + +def register_test_codecs(): + codecs.register(test_codec_infos.get) + + +def unregister_test_codecs(): + if sys.version_info >= (3, 10, 0): + codecs.unregister(test_codec_infos.get) + else: + warnings.warn( + "This version of Python doesn’t allow us to unregister codecs.", + stacklevel=1 + ) + + +def is_test_codec(codec): + return codec in test_codec_infos.keys() + + +def test_codec_built_in_equivalent(test_codec): + return_value = test_codec + for suffix in ('_sig', '_be', '_le'): + return_value = return_value.replace(suffix, '') + return return_value + + +def uses_bom(codec): + for suffix in ('_32', '_16', '_sig'): + if codec.endswith(suffix): + return True + return False + + +def encoding_detectable(string, codec): + """ + Returns True if encoding can be detected after string is encoded + + Encoding detection only works if you’re using a BOM or the first character + is ASCII. See yamllint.decoder.auto_decode()’s docstring. 
+ """ + return uses_bom(codec) or (len(string) > 0 and string[0].isascii()) + + +# Workspace related stuff: +class Blob: + def __init__(self, text, encoding): + self.text = text + self.encoding = encoding + + +def build_temp_workspace(files): + tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') + + for path, content in files.items(): + path = os.fsencode(os.path.join(tempdir, path)) + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + + if isinstance(content, list): + os.mkdir(path) + elif isinstance(content, str) and content.startswith('symlink://'): + os.symlink(content[10:], path) + else: + if isinstance(content, Blob): + content = content.text.encode(content.encoding) + mode = 'wb' if isinstance(content, bytes) else 'w' + with open(path, mode) as f: + f.write(content) + + return tempdir + + +@contextlib.contextmanager +def temp_workspace(files): + """Provide a temporary workspace that is automatically cleaned up.""" + backup_wd = os.getcwd() + wd = build_temp_workspace(files) + + try: + os.chdir(wd) + yield + finally: + os.chdir(backup_wd) + shutil.rmtree(wd) + + +def temp_workspace_with_files_in_many_codecs(path_template, text): + workspace = {} + for codec in UTF_CODECS: + if encoding_detectable(text, codec): + workspace[path_template.format(codec)] = Blob(text, codec) + return workspace + + +# Miscellaneous stuff: class RuleTestCase(unittest.TestCase): def build_fake_config(self, conf): if conf is None: @@ -81,37 +229,3 @@ def __exit__(self, *exc_info): @property def returncode(self): return self._raises_ctx.exception.code - - -def build_temp_workspace(files): - tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') - - for path, content in files.items(): - path = os.fsencode(os.path.join(tempdir, path)) - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - if isinstance(content, list): - os.mkdir(path) - elif isinstance(content, str) and content.startswith('symlink://'): - os.symlink(content[10:], path) - else: - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: - f.write(content) - - return tempdir - - -@contextlib.contextmanager -def temp_workspace(files): - """Provide a temporary workspace that is automatically cleaned up.""" - backup_wd = os.getcwd() - wd = build_temp_workspace(files) - - try: - os.chdir(wd) - yield - finally: - os.chdir(backup_wd) - shutil.rmtree(wd) diff --git a/tests/test_cli.py b/tests/test_cli.py index 765d7083..51825efc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,7 +23,14 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext, temp_workspace +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + temp_workspace, + unregister_test_codecs, + temp_workspace_with_files_in_many_codecs, +) from yamllint import cli, config @@ -819,3 +826,52 @@ def test_multiple_parent_config_file(self): self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, './4spaces.yml:2:5: [warning] wrong indentation: ' 'expected 3 but found 4 (indentation)\n', '')) + + +class CommandLineEncodingTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + register_test_codecs() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + unregister_test_codecs() + + def test_valid_encodings(self): + conf = ('---\n' + 'rules:\n' + ' key-ordering: enable\n') + config_files = temp_workspace_with_files_in_many_codecs( + 'config_{}.yaml', + conf + ) + 
sorted_correctly = ('---\n' + 'A: YAML\n' + 'Z: YAML\n') + sorted_correctly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_correctly/{}.yaml', + sorted_correctly + ) + sorted_incorrectly = ('---\n' + 'Z: YAML\n' + 'A: YAML\n') + sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_incorrectly/{}.yaml', + sorted_incorrectly + ) + workspace = { + **config_files, + **sorted_correctly_files, + **sorted_incorrectly_files + } + + with temp_workspace(workspace): + for config_path in config_files.keys(): + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_correctly/')) + self.assertEqual(ctx.returncode, 0) + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_incorrectly/')) + self.assertNotEqual(ctx.returncode, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py new file mode 100644 index 00000000..f7ef1650 --- /dev/null +++ b/tests/test_decoder.py @@ -0,0 +1,452 @@ +# Copyright (C) 2023–2024 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import codecs +import unittest + +from tests.common import ( + UTF_CODECS, + encoding_detectable, + is_test_codec, + register_test_codecs, + test_codec_built_in_equivalent, + unregister_test_codecs, + uses_bom, +) + +from yamllint import decoder + + +class PreEncodedTestStringInfo(): + def __init__( + self, + input_bytes, + codec_for_input_bytes, + expected_output_str + ): + self.input_bytes = input_bytes + self.codec_for_input_bytes = codec_for_input_bytes + self.expected_output_str = expected_output_str + + +PRE_ENCODED_TEST_STRING_INFOS = ( + # An empty string + PreEncodedTestStringInfo( + b'', + None, + '' + ), + + # A single ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00|', + 'utf_32_be', + '|' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00|', + 'utf_32', + '|' + ), + PreEncodedTestStringInfo( + b'|\x00\x00\x00', + 'utf_32_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00|\x00\x00\x00', + 'utf_32', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'\x00|', + 'utf_16_be', + '|' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00|', + 'utf_16', # BE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|\x00', + 'utf_16_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe|\x00', + 'utf_16', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|', + 'utf_8', + '|' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf|', + 'utf_8_sig', + '|' + ), + + # A string that starts with an ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'What’s up?' 
+ ), + PreEncodedTestStringInfo( + b'W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfeW\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'What\xe2\x80\x99s up?', + 'utf_8', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbfWhat\xe2\x80\x99s up?', + 'utf_8_sig', + 'What’s up?' + ), + + # A single non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x01\xf4;', + 'utf_32', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00;\xf4\x01\x00', + 'utf_32', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\xd8=\xdc;', + 'utf_16', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe=\xd8;\xdc', + 'utf_16', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xf0\x9f\x90\xbb', + 'utf_8_sig', + '🐻' + ), + + # A string that starts with a non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00\xc7\x00a\x00 \x00v\x00a\x00?', + 'utf_16', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\xc7\x00a\x00 \x00v\x00a\x00?\x00', + 'utf_16', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xc3\x87a va?', + 'utf_8_sig', + 'Ça va?' + ) +) +TEST_STRINGS_TO_ENCODE_AT_RUNTIME = ( + "", + "y", + "yaml", + "🇾⁠🇦⁠🇲⁠🇱⁠❗" +) +setUpModule = register_test_codecs +tearDownModule = unregister_test_codecs + + +class EncodingStuffFromCommonTestCase(unittest.TestCase): + def test_test_codecs_and_utf_codecs(self): + error = "{} failed to correctly encode then decode {}." 
+ for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + self.assertEqual( + string, + string.encode(codec).decode(codec), + msg=error.format(repr(codec), repr(string)) + ) + + def test_is_test_codec(self): + self.assertFalse(is_test_codec('utf_32')) + self.assertFalse(is_test_codec('utf_32_be')) + self.assertTrue(is_test_codec('utf_32_be_sig')) + self.assertFalse(is_test_codec('utf_32_le')) + self.assertTrue(is_test_codec('utf_32_le_sig')) + + self.assertFalse(is_test_codec('utf_16')) + self.assertFalse(is_test_codec('utf_16_be')) + self.assertTrue(is_test_codec('utf_16_be_sig')) + self.assertFalse(is_test_codec('utf_16_le')) + self.assertTrue(is_test_codec('utf_16_le_sig')) + + self.assertFalse(is_test_codec('utf_8')) + self.assertFalse(is_test_codec('utf_8_be')) + + def test_test_codec_built_in_equivalent(self): + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_be_sig') + ) + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_le_sig') + ) + + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_be_sig') + ) + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_le_sig') + ) + + def test_uses_bom(self): + self.assertTrue(uses_bom('utf_32')) + self.assertFalse(uses_bom('utf_32_be')) + self.assertTrue(uses_bom('utf_32_be_sig')) + self.assertFalse(uses_bom('utf_32_le')) + self.assertTrue(uses_bom('utf_32_le_sig')) + + self.assertTrue(uses_bom('utf_16')) + self.assertFalse(uses_bom('utf_16_be')) + self.assertTrue(uses_bom('utf_16_be_sig')) + self.assertFalse(uses_bom('utf_16_le')) + self.assertTrue(uses_bom('utf_16_le_sig')) + + self.assertFalse(uses_bom('utf_8')) + self.assertTrue(uses_bom('utf_8_sig')) + + def test_encoding_detectable(self): + # No BOM + nothing + self.assertFalse(encoding_detectable('', 'utf_32_be')) + self.assertFalse(encoding_detectable('', 'utf_32_le')) + + self.assertFalse(encoding_detectable('', 'utf_16_be')) + self.assertFalse(encoding_detectable('', 'utf_16_le')) + + self.assertFalse(encoding_detectable('', 'utf_8')) + # BOM + nothing + self.assertTrue(encoding_detectable('', 'utf_32')) + self.assertTrue(encoding_detectable('', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_16')) + self.assertTrue(encoding_detectable('', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_8_sig')) + # No BOM + non-ASCII + self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be')) + self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le')) + + self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be')) + self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le')) + + self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8')) + # No BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32_be')) + self.assertTrue(encoding_detectable('gi', 'utf_32_le')) + + self.assertTrue(encoding_detectable('ve', 'utf_16_be')) + self.assertTrue(encoding_detectable(' y', 'utf_16_le')) + + self.assertTrue(encoding_detectable('ou', 'utf_8')) + # BOM + non-ASCII + self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32')) + self.assertTrue(encoding_detectable('ⓟ␤', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16')) + self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('ⓝⓝ', 
'utf_8_sig')) + # BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32')) + self.assertTrue(encoding_detectable('le', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('yo', 'utf_16')) + self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('do', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('wn', 'utf_8_sig')) + + +class DecoderTestCase(unittest.TestCase): + def detect_encoding_test_helper( + self, + original_string, + input_bytes, + expected_output + ): + ERROR1 = "{} was encoded with {}, but detect_encoding() returned {}." + ERROR2 = "detect_encoding({}) returned a codec that isn’t built-in." + actual_output = decoder.detect_encoding(input_bytes) + if expected_output is not None: + self.assertEqual( + expected_output, + actual_output, + msg=ERROR1.format( + input_bytes, + repr(expected_output), + repr(actual_output) + ) + ) + + codec_info = codecs.lookup(actual_output) + self.assertFalse( + is_test_codec(codec_info), + msg=ERROR2.format(input_bytes) + ) + + def test_detect_encoding_with_pre_encoded_strings(self): + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + self.detect_encoding_test_helper( + pre_encoded_test_string_info.expected_output_str, + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes + ) + + def test_detect_encoding_with_strings_encoded_at_runtime(self): + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + if not uses_bom(codec) and len(string) == 0: + expected_output = 'utf_8' + elif not encoding_detectable(string, codec): + expected_output = None + elif is_test_codec(codec): + expected_output = test_codec_built_in_equivalent(codec) + else: + expected_output = codec + self.detect_encoding_test_helper( + string, + string.encode(codec), + expected_output + ) + + def auto_decode_test_helper( + self, + input_bytes, + codec_for_input_bytes, + expected_output + ): + ERROR = "auto_decode({}) returned the wrong value." 
+ does_auto_detect_encodings_return_value_matter = ( + codec_for_input_bytes is not None and ( + encoding_detectable(expected_output, codec_for_input_bytes) + or len(input_bytes) == 0 + ) + ) + if does_auto_detect_encodings_return_value_matter: + actual_output = decoder.auto_decode(input_bytes) + self.assertEqual( + expected_output, + actual_output, + msg=ERROR.format(repr(input_bytes)) + ) + self.assertIsInstance(actual_output, str) + else: + try: + decoder.auto_decode(input_bytes) + except UnicodeDecodeError as exception: + return exception + return None + + def test_auto_decode_with_pre_encoded_strings(self): + ERROR = "auto_decode({}) should not have raised an exception" + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + exception = self.auto_decode_test_helper( + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes, + pre_encoded_test_string_info.expected_output_str + ) + if exception is not None: + new_exception = self.failureException( + msg=ERROR.format( + repr(pre_encoded_test_string_info.input_bytes) + ) + ) + raise new_exception from exception + + def test_auto_decode_with_strings_encoded_at_runtime(self): + at_least_one_decode_error = False + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + exception = self.auto_decode_test_helper( + string.encode(codec), + codec, + string + ) + if exception is not None: + at_least_one_decode_error = True + self.assertTrue( + at_least_one_decode_error, + msg=( + "None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a " + + "decoding error." + ) + ) diff --git a/yamllint/cli.py b/yamllint/cli.py index 9a39bd8c..7059b852 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -219,7 +219,7 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with open(file, newline='') as f: + with open(file, mode='rb') as f: problems = linter.run(f, conf, filepath) except OSError as e: print(e, file=sys.stderr) diff --git a/yamllint/config.py b/yamllint/config.py index 9ce62549..c40d8205 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -20,6 +20,7 @@ import yaml import yamllint.rules +from yamllint import decoder class YamlLintConfigError(Exception): @@ -38,8 +39,8 @@ def __init__(self, content=None, file=None): self.locale = None if file is not None: - with open(file) as f: - content = f.read() + with open(file, mode='rb') as f: + content = decoder.auto_decode(f.read()) self.parse(content) self.validate() diff --git a/yamllint/decoder.py b/yamllint/decoder.py new file mode 100644 index 00000000..1e3c2f32 --- /dev/null +++ b/yamllint/decoder.py @@ -0,0 +1,65 @@ +# Copyright (C) 2023 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+
+import codecs
+
+
+def detect_encoding(stream_data):
+    """
+    Return stream_data’s character encoding
+
+    Specifically, this function will take a bytes object and return a string
+    that contains the name of one of Python’s built-in codecs [1].
+
+    The YAML spec says that streams must begin with a BOM or an ASCII
+    character. If stream_data doesn’t begin with either of those, then this
+    function might return the wrong encoding. See chapter 5.2 of the YAML spec
+    for details [2].
+
+    [1]:
+    [2]:
+    """
+    if stream_data.startswith(codecs.BOM_UTF32_BE):
+        return 'utf_32'
+    elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4:
+        return 'utf_32_be'
+    elif stream_data.startswith(codecs.BOM_UTF32_LE):
+        return 'utf_32'
+    elif stream_data[1:4] == b'\x00\x00\x00':
+        return 'utf_32_le'
+    elif stream_data.startswith(codecs.BOM_UTF16_BE):
+        return 'utf_16'
+    elif stream_data.startswith(b'\x00') and len(stream_data) >= 2:
+        return 'utf_16_be'
+    elif stream_data.startswith(codecs.BOM_UTF16_LE):
+        return 'utf_16'
+    elif stream_data[1:2] == b'\x00':
+        return 'utf_16_le'
+    elif stream_data.startswith(codecs.BOM_UTF8):
+        return 'utf_8_sig'
+    else:
+        return 'utf_8'
+
+
+def auto_decode(stream_data):
+    return stream_data.decode(encoding=detect_encoding(stream_data))
+
+
+def lines_in_files(paths):
+    """Autodecodes files and yields their lines."""
+    for path in paths:
+        with open(path, 'rb') as file:
+            text = auto_decode(file.read())
+            yield from text.splitlines()
diff --git a/yamllint/linter.py b/yamllint/linter.py
index a2faa061..2230a600 100644
--- a/yamllint/linter.py
+++ b/yamllint/linter.py
@@ -18,7 +18,7 @@
 
 import yaml
 
-from yamllint import parser
+from yamllint import decoder, parser
 
 PROBLEM_LEVELS = {
     0: None,
@@ -187,6 +187,8 @@ def get_syntax_error(buffer):
 def _run(buffer, conf, filepath):
     assert hasattr(buffer, '__getitem__'), \
         '_run() argument must be a buffer, not a stream'
+    if isinstance(buffer, bytes):
+        buffer = decoder.auto_decode(buffer)
 
     first_line = next(parser.line_generator(buffer)).content
     if re.match(r'^#\s*yamllint disable-file\s*$', first_line):
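
Usage sketch (not part of the diff above): the snippet below shows how the
new yamllint.decoder helpers introduced by this patch can be exercised. It
assumes a yamllint checkout with the patch applied; the sample YAML strings
are made up for illustration. detect_encoding() picks UTF-8, UTF-16 or UTF-32
from the BOM or from the pattern of null bytes around a leading ASCII
character, and auto_decode() applies that result, mirroring what config.py
now does with its binary-mode read.

    # sketch.py — exercise the encoding detection added by this patch
    from yamllint import decoder

    samples = (
        b'key: value\n',                     # plain UTF-8, no BOM
        'key: value\n'.encode('utf_16'),     # UTF-16 with BOM (native order)
        'key: value\n'.encode('utf_32_be'),  # UTF-32 BE, no BOM, leading ASCII
    )
    for raw in samples:
        # detect_encoding() returns the name of a built-in codec;
        # auto_decode() is detect_encoding() plus bytes.decode().
        print(decoder.detect_encoding(raw), repr(decoder.auto_decode(raw)))

Running this prints utf_8, utf_16 and utf_32_be respectively, each followed
by the decoded 'key: value\n' string. Bytes that begin with neither a BOM nor
an ASCII character may be detected incorrectly and can then fail to decode,
which is the limitation the commit message notes above.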