From 9f5cf295af685bdcd0561706ea78daff959eae9f Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Tue, 2 Jan 2024 09:35:36 -0500 Subject: [PATCH 1/6] tests: Use correct encoding for path Before this change, build_temp_workspace() would always encode a path using UTF-8 and the strict error handler [1]. Most of the time, this is fine, but systems do not necessarily use UTF-8 and the strict error handler for paths [2]. [1]: [2]: --- tests/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common.py b/tests/common.py index 29dcfb9c..25b2f6e1 100644 --- a/tests/common.py +++ b/tests/common.py @@ -87,7 +87,7 @@ def build_temp_workspace(files): tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') for path, content in files.items(): - path = os.path.join(tempdir, path).encode('utf-8') + path = os.fsencode(os.path.join(tempdir, path)) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) From 806d79fe835dad783cc0f6efadaf004228eba2ff Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Wed, 3 Jan 2024 11:50:42 -0500 Subject: [PATCH 2/6] tests: Restore stdout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this commit, test_run_default_format_output_in_tty() changed the value of sys.stdout, but it would never change it back to the original value. This commit makes sure that it gets changed back. At the moment, this commit doesn’t make a user-visible difference. A future commit will add a new test named test_ignored_from_file_with_multiple_encodings(). That new test requires that stdout gets restored, or else it will fail. 
--- tests/test_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 308d6181..ad8c71d0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -498,6 +498,7 @@ def test_run_default_format_output_in_tty(self): path = os.path.join(self.wd, 'a.yaml') # Create a pseudo-TTY and redirect stdout to it + old_stdout = sys.stdout master, slave = pty.openpty() sys.stdout = os.fdopen(slave, 'w') @@ -515,6 +516,7 @@ def test_run_default_format_output_in_tty(self): sys.stdout.close() output.close() + sys.stdout = old_stdout self.assertEqual(out, ( f'\033[4m{path}\033[0m\n' From d0fc380f0cf092af1087972cdf8f279c2b13b31a Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Sat, 30 Dec 2023 12:51:24 -0500 Subject: [PATCH 3/6] decoder: Autodetect encoding of YAML files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change, yamllint would open YAML files using open()’s default encoding. As long as UTF-8 mode isn’t enabled, open() defaults to using the system’s locale encoding [1][2]. This can cause problems in multiple different scenarios. The first scenario involves linting UTF-8 YAML files on Linux systems. Most of the time, the locale encoding on Linux systems is set to UTF-8 [3][4], but it can be set to something else [5]. In the unlikely event that someone was using Linux with a locale encoding other than UTF-8, there was a chance that yamllint would crash with a UnicodeDecodeError. The second scenario involves linting UTF-8 YAML files on Windows systems. The locale encoding on Windows systems is the system’s ANSI code page [6]. The ANSI code page on Windows systems is NOT set to UTF-8 by default [7]. In the very likely event that someone was using Windows with a locale encoding other than UTF-8, there was a chance that yamllint would crash with a UnicodeDecodeError. Additionally, using open()’s default encoding is a violation of the YAML spec. 
Chapter 5.2 says: “On input, a YAML processor must support the UTF-8 and UTF-16 character encodings. For JSON compatibility, the UTF-32 encodings must also be supported. If a character stream begins with a byte order mark, the character encoding will be taken to be as indicated by the byte order mark. Otherwise, the stream must begin with an ASCII character. This allows the encoding to be deduced by the pattern of null (x00) characters.” [8] This change fixes all of those problems by implementing the YAML spec’s character encoding detection algorithm. Now, as long as YAML files begin with either a byte order mark or an ASCII character, yamllint will automatically detect them as being UTF-8, UTF-16 or UTF-32. Other character encodings are not supported at the moment. It’s possible that this change will break things for existing yamllint users. This change allows users to use the YAMLLINT_FILE_ENCODING environment variable to override the autodetection algorithm just in case they’ve been using yamllint on weird nonstandard YAML files. Credit for the idea of having tests with pre-encoded strings goes to @adrienverge [9]. Fixes #218. Fixes #238. Fixes #347. 
[1]: [2]: [3]: [4]: [5]: [6]: [7]: [8]: [9]: --- docs/character_encoding_override.rst | 32 ++ docs/index.rst | 1 + tests/common.py | 187 +++++++++-- tests/test_cli.py | 60 +++- tests/test_decoder.py | 454 +++++++++++++++++++++++++++ yamllint/cli.py | 2 +- yamllint/config.py | 5 +- yamllint/decoder.py | 79 +++++ yamllint/linter.py | 4 +- 9 files changed, 785 insertions(+), 39 deletions(-) create mode 100644 docs/character_encoding_override.rst create mode 100644 tests/test_decoder.py create mode 100644 yamllint/decoder.py diff --git a/docs/character_encoding_override.rst b/docs/character_encoding_override.rst new file mode 100644 index 00000000..5d5cd4ee --- /dev/null +++ b/docs/character_encoding_override.rst @@ -0,0 +1,32 @@ +Character Encoding Override +=========================== + +When yamllint reads a file, it will try to automatically detect that file’s +character encoding. In order for the automatic detection to work properly, +files must follow these two rules (see `this section of the YAML specification +for details <https://yaml.org/spec/1.2.2/#52-character-encodings>`_): + +* The file must be encoded in UTF-8, UTF-16 or UTF-32. + +* The file must begin with either a byte order mark or an ASCII character. + +Previous versions of yamllint did not try to autodetect the character encoding +of files. Previous versions of yamllint assumed that files used the current +locale’s character encoding. This meant that older versions of yamllint would +sometimes correctly decode files that didn’t follow those two rules. For the +sake of backwards compatibility, the current version of yamllint allows you to +disable automatic character encoding detection by setting the +``YAMLLINT_FILE_ENCODING`` environment variable. If you set the +``YAMLLINT_FILE_ENCODING`` environment variable to `the name of one of +Python’s standard character encodings +<https://docs.python.org/3/library/codecs.html#standard-encodings>`_, then +yamllint will use that character encoding instead of trying to autodetect the +character encoding. 
+ +The ``YAMLLINT_FILE_ENCODING`` environment variable should only be used as a +stopgap solution. If you need to use ``YAMLLINT_FILE_ENCODING``, then you +should really update your YAML files so that their character encoding can +automatically be detected, or else you may run into compatibility problems. +Future versions of yamllint may remove support for the +``YAMLLINT_FILE_ENCODING`` environment variable, and other YAML processors may +misinterpret your YAML files. diff --git a/docs/index.rst b/docs/index.rst index 5456d6a6..929288f2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -27,3 +27,4 @@ Table of contents development text_editors integration + character_encoding_override diff --git a/tests/common.py b/tests/common.py index 25b2f6e1..1891a198 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,4 +1,5 @@ # Copyright (C) 2016 Adrien Vergé +# Copyright (C) 2023–2025 Jason Yundt # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -13,6 +14,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import codecs import contextlib from io import StringIO import os @@ -20,6 +22,8 @@ import sys import tempfile import unittest +import warnings +from codecs import CodecInfo import yaml @@ -27,6 +31,155 @@ from yamllint.config import YamlLintConfig +# Encoding related stuff: +UTF_CODECS = ( + 'utf_32_be', + 'utf_32_be_sig', + 'utf_32_le', + 'utf_32_le_sig', + 'utf_16_be', + 'utf_16_be_sig', + 'utf_16_le', + 'utf_16_le_sig', + 'utf_8', + 'utf_8_sig' +) + + +def encode_utf_32_be_sig(obj): + return ( + codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', 'strict'), + len(obj) + ) + + +def encode_utf_32_le_sig(obj): + return ( + codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', 'strict'), + len(obj) + ) + + +def encode_utf_16_be_sig(obj): + return ( + codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', 'strict'), + len(obj) + ) + + +def encode_utf_16_le_sig(obj): + return ( + codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', 'strict'), + len(obj) + ) + + +test_codec_infos = { + 'utf_32_be_sig': + CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')), + 'utf_32_le_sig': + CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')), + 'utf_16_be_sig': + CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')), + 'utf_16_le_sig': + CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')), +} + + +def register_test_codecs(): + codecs.register(test_codec_infos.get) + + +def unregister_test_codecs(): + if sys.version_info >= (3, 10, 0): + codecs.unregister(test_codec_infos.get) + else: + warnings.warn( + "This version of Python doesn’t allow us to unregister codecs.", + stacklevel=1 + ) + + +def is_test_codec(codec): + return codec in test_codec_infos.keys() + + +def test_codec_built_in_equivalent(test_codec): + return_value = test_codec + for suffix in ('_sig', '_be', '_le'): + return_value = return_value.replace(suffix, '') + return return_value + + +def uses_bom(codec): + for suffix in ('_32', '_16', '_sig'): + if codec.endswith(suffix): + 
return True + return False + + +def encoding_detectable(string, codec): + """ + Returns True if encoding can be detected after string is encoded + + Encoding detection only works if you’re using a BOM or the first character + is ASCII. See yamllint.decoder.auto_decode()’s docstring. + """ + return uses_bom(codec) or (len(string) > 0 and string[0].isascii()) + + +# Workspace related stuff: +class Blob: + def __init__(self, text, encoding): + self.text = text + self.encoding = encoding + + +def build_temp_workspace(files): + tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') + + for path, content in files.items(): + path = os.fsencode(os.path.join(tempdir, path)) + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + + if isinstance(content, list): + os.mkdir(path) + elif isinstance(content, str) and content.startswith('symlink://'): + os.symlink(content[10:], path) + else: + if isinstance(content, Blob): + content = content.text.encode(content.encoding) + mode = 'wb' if isinstance(content, bytes) else 'w' + with open(path, mode) as f: + f.write(content) + + return tempdir + + +@contextlib.contextmanager +def temp_workspace(files): + """Provide a temporary workspace that is automatically cleaned up.""" + backup_wd = os.getcwd() + wd = build_temp_workspace(files) + + try: + os.chdir(wd) + yield + finally: + os.chdir(backup_wd) + shutil.rmtree(wd) + + +def temp_workspace_with_files_in_many_codecs(path_template, text): + workspace = {} + for codec in UTF_CODECS: + if encoding_detectable(text, codec): + workspace[path_template.format(codec)] = Blob(text, codec) + return workspace + + +# Miscellaneous stuff: class RuleTestCase(unittest.TestCase): def build_fake_config(self, conf): if conf is None: @@ -81,37 +234,3 @@ def __exit__(self, *exc_info): @property def returncode(self): return self._raises_ctx.exception.code - - -def build_temp_workspace(files): - tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') - - for path, content in 
files.items(): - path = os.fsencode(os.path.join(tempdir, path)) - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - if isinstance(content, list): - os.mkdir(path) - elif isinstance(content, str) and content.startswith('symlink://'): - os.symlink(content[10:], path) - else: - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: - f.write(content) - - return tempdir - - -@contextlib.contextmanager -def temp_workspace(files): - """Provide a temporary workspace that is automatically cleaned up.""" - backup_wd = os.getcwd() - wd = build_temp_workspace(files) - - try: - os.chdir(wd) - yield - finally: - os.chdir(backup_wd) - shutil.rmtree(wd) diff --git a/tests/test_cli.py b/tests/test_cli.py index ad8c71d0..d8045d3d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,4 +1,5 @@ # Copyright (C) 2016 Adrien Vergé +# Copyright (C) 2023–2025 Jason Yundt # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -22,7 +23,14 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext, temp_workspace +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + temp_workspace, + unregister_test_codecs, + temp_workspace_with_files_in_many_codecs, +) from yamllint import cli, config @@ -42,6 +50,7 @@ def setUpModule(): # yamllint uses these environment variables to find a config file. env_vars_that_could_interfere = ( 'YAMLLINT_CONFIG_FILE', + 'YAMLLINT_FILE_ENCODING', 'XDG_CONFIG_HOME', # These variables are used to determine where the user’s home # directory is. 
See @@ -816,3 +825,52 @@ def test_multiple_parent_config_file(self): self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, './4spaces.yml:2:5: [warning] wrong indentation: ' 'expected 3 but found 4 (indentation)\n', '')) + + +class CommandLineEncodingTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + register_test_codecs() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + unregister_test_codecs() + + def test_valid_encodings(self): + conf = ('---\n' + 'rules:\n' + ' key-ordering: enable\n') + config_files = temp_workspace_with_files_in_many_codecs( + 'config_{}.yaml', + conf + ) + sorted_correctly = ('---\n' + 'A: YAML\n' + 'Z: YAML\n') + sorted_correctly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_correctly/{}.yaml', + sorted_correctly + ) + sorted_incorrectly = ('---\n' + 'Z: YAML\n' + 'A: YAML\n') + sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_incorrectly/{}.yaml', + sorted_incorrectly + ) + workspace = { + **config_files, + **sorted_correctly_files, + **sorted_incorrectly_files + } + + with temp_workspace(workspace): + for config_path in config_files.keys(): + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_correctly/')) + self.assertEqual(ctx.returncode, 0) + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_incorrectly/')) + self.assertNotEqual(ctx.returncode, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py new file mode 100644 index 00000000..e7723ef8 --- /dev/null +++ b/tests/test_decoder.py @@ -0,0 +1,454 @@ +# Copyright (C) 2023–2025 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import codecs +import os +import unittest + +from tests.common import ( + UTF_CODECS, + encoding_detectable, + is_test_codec, + register_test_codecs, + test_codec_built_in_equivalent, + unregister_test_codecs, + uses_bom, +) + +from yamllint import decoder + + +class PreEncodedTestStringInfo: + def __init__( + self, + input_bytes, + codec_for_input_bytes, + expected_output_str + ): + self.input_bytes = input_bytes + self.codec_for_input_bytes = codec_for_input_bytes + self.expected_output_str = expected_output_str + + +PRE_ENCODED_TEST_STRING_INFOS = ( + # An empty string + PreEncodedTestStringInfo( + b'', + None, + '' + ), + + # A single ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00|', + 'utf_32_be', + '|' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00|', + 'utf_32', + '|' + ), + PreEncodedTestStringInfo( + b'|\x00\x00\x00', + 'utf_32_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00|\x00\x00\x00', + 'utf_32', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'\x00|', + 'utf_16_be', + '|' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00|', + 'utf_16', # BE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|\x00', + 'utf_16_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe|\x00', + 'utf_16', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|', + 'utf_8', + '|' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf|', + 'utf_8_sig', + '|' + ), + + # A string that starts with an ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 
\x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfeW\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'What\xe2\x80\x99s up?', + 'utf_8', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbfWhat\xe2\x80\x99s up?', + 'utf_8_sig', + 'What’s up?' 
+ ), + + # A single non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x01\xf4;', + 'utf_32', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00;\xf4\x01\x00', + 'utf_32', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\xd8=\xdc;', + 'utf_16', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe=\xd8;\xdc', + 'utf_16', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xf0\x9f\x90\xbb', + 'utf_8_sig', + '🐻' + ), + + # A string that starts with a non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00\xc7\x00a\x00 \x00v\x00a\x00?', + 'utf_16', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\xc7\x00a\x00 \x00v\x00a\x00?\x00', + 'utf_16', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xc3\x87a va?', + 'utf_8_sig', + 'Ça va?' + ) +) +TEST_STRINGS_TO_ENCODE_AT_RUNTIME = ( + "", + "y", + "yaml", + "🇾⁠🇦⁠🇲⁠🇱⁠❗" +) + + +def setUpModule(): + register_test_codecs() + try: + del os.environ['YAMLLINT_FILE_ENCODING'] + except KeyError: + pass + + +tearDownModule = unregister_test_codecs + + +class EncodingStuffFromCommonTestCase(unittest.TestCase): + def test_test_codecs_and_utf_codecs(self): + error = "{} failed to correctly encode then decode {}." 
+ for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + self.assertEqual( + string, + string.encode(codec).decode(codec), + msg=error.format(repr(codec), repr(string)) + ) + + def test_is_test_codec(self): + self.assertFalse(is_test_codec('utf_32')) + self.assertFalse(is_test_codec('utf_32_be')) + self.assertTrue(is_test_codec('utf_32_be_sig')) + self.assertFalse(is_test_codec('utf_32_le')) + self.assertTrue(is_test_codec('utf_32_le_sig')) + + self.assertFalse(is_test_codec('utf_16')) + self.assertFalse(is_test_codec('utf_16_be')) + self.assertTrue(is_test_codec('utf_16_be_sig')) + self.assertFalse(is_test_codec('utf_16_le')) + self.assertTrue(is_test_codec('utf_16_le_sig')) + + self.assertFalse(is_test_codec('utf_8')) + self.assertFalse(is_test_codec('utf_8_be')) + + def test_test_codec_built_in_equivalent(self): + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_be_sig') + ) + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_le_sig') + ) + + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_be_sig') + ) + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_le_sig') + ) + + def test_uses_bom(self): + self.assertTrue(uses_bom('utf_32')) + self.assertFalse(uses_bom('utf_32_be')) + self.assertTrue(uses_bom('utf_32_be_sig')) + self.assertFalse(uses_bom('utf_32_le')) + self.assertTrue(uses_bom('utf_32_le_sig')) + + self.assertTrue(uses_bom('utf_16')) + self.assertFalse(uses_bom('utf_16_be')) + self.assertTrue(uses_bom('utf_16_be_sig')) + self.assertFalse(uses_bom('utf_16_le')) + self.assertTrue(uses_bom('utf_16_le_sig')) + + self.assertFalse(uses_bom('utf_8')) + self.assertTrue(uses_bom('utf_8_sig')) + + def test_encoding_detectable(self): + # No BOM + nothing + self.assertFalse(encoding_detectable('', 'utf_32_be')) + self.assertFalse(encoding_detectable('', 'utf_32_le')) + + self.assertFalse(encoding_detectable('', 'utf_16_be')) + 
self.assertFalse(encoding_detectable('', 'utf_16_le')) + + self.assertFalse(encoding_detectable('', 'utf_8')) + # BOM + nothing + self.assertTrue(encoding_detectable('', 'utf_32')) + self.assertTrue(encoding_detectable('', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_16')) + self.assertTrue(encoding_detectable('', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_8_sig')) + # No BOM + non-ASCII + self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be')) + self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le')) + + self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be')) + self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le')) + + self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8')) + # No BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32_be')) + self.assertTrue(encoding_detectable('gi', 'utf_32_le')) + + self.assertTrue(encoding_detectable('ve', 'utf_16_be')) + self.assertTrue(encoding_detectable(' y', 'utf_16_le')) + + self.assertTrue(encoding_detectable('ou', 'utf_8')) + # BOM + non-ASCII + self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32')) + self.assertTrue(encoding_detectable('ⓟ␤', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16')) + self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig')) + # BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32')) + self.assertTrue(encoding_detectable('le', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('yo', 'utf_16')) + self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('do', 'utf_16_le_sig')) + + 
self.assertTrue(encoding_detectable('wn', 'utf_8_sig')) + + +class DecoderTestCase(unittest.TestCase): + def detect_encoding_test_helper(self, input_bytes, expected_codec): + ERROR1 = "{} was encoded with {}, but detect_encoding() returned {}." + ERROR2 = "detect_encoding({}) returned a codec that isn’t built-in." + actual_codec = decoder.detect_encoding(input_bytes) + if expected_codec is not None: + self.assertEqual( + expected_codec, + actual_codec, + msg=ERROR1.format( + input_bytes, + repr(expected_codec), + repr(actual_codec) + ) + ) + + codec_info = codecs.lookup(actual_codec) + self.assertFalse( + is_test_codec(codec_info), + msg=ERROR2.format(input_bytes) + ) + + def test_detect_encoding_with_pre_encoded_strings(self): + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + self.detect_encoding_test_helper( + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes + ) + + def test_detect_encoding_with_strings_encoded_at_runtime(self): + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + if not uses_bom(codec) and len(string) == 0: + expected_codec = 'utf_8' + elif not encoding_detectable(string, codec): + expected_codec = None + elif is_test_codec(codec): + expected_codec = test_codec_built_in_equivalent(codec) + else: + expected_codec = codec + self.detect_encoding_test_helper( + string.encode(codec), + expected_codec + ) + + def test_detect_encoding_with_env_var_override(self): + # These three encodings were chosen randomly. 
+ NONSTANDARD_ENCODINGS = ('iso8859_6', 'iso8859_11', 'euc_jis_2004') + RANDOM_BYTES = b'\x90Jg\xd9rS\x95\xd6[\x1d\x8b\xc4Ir\x0fC' + for nonstandard_encoding in NONSTANDARD_ENCODINGS: + os.environ['YAMLLINT_FILE_ENCODING'] = nonstandard_encoding + self.assertEqual( + decoder.detect_encoding(RANDOM_BYTES), + nonstandard_encoding + ) + del os.environ['YAMLLINT_FILE_ENCODING'] + + def auto_decode_test_helper( + self, + input_bytes, + codec_for_input_bytes, + expected_string + ): + ERROR = "auto_decode({}) returned the wrong value." + does_auto_detect_encodings_return_value_matter = ( + codec_for_input_bytes is not None and ( + encoding_detectable(expected_string, codec_for_input_bytes) + or len(input_bytes) == 0 + ) + ) + if does_auto_detect_encodings_return_value_matter: + actual_output = decoder.auto_decode(input_bytes) + self.assertEqual( + expected_string, + actual_output, + msg=ERROR.format(repr(input_bytes)) + ) + self.assertIsInstance(actual_output, str) + else: + decoder.auto_decode(input_bytes) + + def test_auto_decode_with_pre_encoded_strings(self): + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + self.auto_decode_test_helper( + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes, + pre_encoded_test_string_info.expected_output_str + ) + + def test_auto_decode_with_strings_encoded_at_runtime(self): + at_least_one_decode_error = False + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + try: + self.auto_decode_test_helper( + string.encode(codec), + codec, + string + ) + except UnicodeDecodeError: + at_least_one_decode_error = True + self.assertTrue( + at_least_one_decode_error, + msg=("None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a " + "decoding error.") + ) diff --git a/yamllint/cli.py b/yamllint/cli.py index 9a39bd8c..7059b852 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -219,7 +219,7 @@ def run(argv=None): for file in 
find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with open(file, newline='') as f: + with open(file, mode='rb') as f: problems = linter.run(f, conf, filepath) except OSError as e: print(e, file=sys.stderr) diff --git a/yamllint/config.py b/yamllint/config.py index 9ce62549..c40d8205 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -20,6 +20,7 @@ import yaml import yamllint.rules +from yamllint import decoder class YamlLintConfigError(Exception): @@ -38,8 +39,8 @@ def __init__(self, content=None, file=None): self.locale = None if file is not None: - with open(file) as f: - content = f.read() + with open(file, mode='rb') as f: + content = decoder.auto_decode(f.read()) self.parse(content) self.validate() diff --git a/yamllint/decoder.py b/yamllint/decoder.py new file mode 100644 index 00000000..552b2899 --- /dev/null +++ b/yamllint/decoder.py @@ -0,0 +1,79 @@ +# Copyright (C) 2023–2025 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import codecs +import os +import warnings + + +def detect_encoding(stream_data): + """ + Return stream_data’s character encoding + + Specifically, this function will take a bytes object and return a string + that contains the name of one of Python’s built-in codecs [1]. + + The YAML spec says that streams must begin with a BOM or an ASCII + character. 
If stream_data doesn’t begin with either of those, then this + function might return the wrong encoding. See chapter 5.2 of the YAML spec + for details [2]. + + Before this function was added, yamllint would sometimes decode text files + using a non-standard character encoding. It’s possible that there are users + out there who still want to use yamllint with non-standard character + encodings, so this function includes an override switch for those users. If + the YAMLLINT_FILE_ENCODING environment variable is set to "example_codec", + then this function will always return "example_codec". + + [1]: + [2]: + """ + if 'YAMLLINT_FILE_ENCODING' in os.environ: + warnings.warn("YAMLLINT_FILE_ENCODING is meant for temporary " + "workarounds. It may be removed in a future version of " + "yamllint.") + return os.environ['YAMLLINT_FILE_ENCODING'] + elif stream_data.startswith(codecs.BOM_UTF32_BE): + return 'utf_32' + elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4: + return 'utf_32_be' + elif stream_data.startswith(codecs.BOM_UTF32_LE): + return 'utf_32' + elif stream_data[1:4] == b'\x00\x00\x00': + return 'utf_32_le' + elif stream_data.startswith(codecs.BOM_UTF16_BE): + return 'utf_16' + elif stream_data.startswith(b'\x00') and len(stream_data) >= 2: + return 'utf_16_be' + elif stream_data.startswith(codecs.BOM_UTF16_LE): + return 'utf_16' + elif stream_data[1:2] == b'\x00': + return 'utf_16_le' + elif stream_data.startswith(codecs.BOM_UTF8): + return 'utf_8_sig' + else: + return 'utf_8' + + +def auto_decode(stream_data): + return stream_data.decode(encoding=detect_encoding(stream_data)) + + +def lines_in_files(paths): + """Autodecodes files and yields their lines.""" + for path in paths: + with open(path, 'rb') as file: + text = auto_decode(file.read()) + yield from text.splitlines() diff --git a/yamllint/linter.py b/yamllint/linter.py index a2faa061..2230a600 100644 --- a/yamllint/linter.py +++ b/yamllint/linter.py @@ -18,7 +18,7 @@ import yaml 
-from yamllint import parser +from yamllint import decoder, parser PROBLEM_LEVELS = { 0: None, @@ -187,6 +187,8 @@ def get_syntax_error(buffer): def _run(buffer, conf, filepath): assert hasattr(buffer, '__getitem__'), \ '_run() argument must be a buffer, not a stream' + if isinstance(buffer, bytes): + buffer = decoder.auto_decode(buffer) first_line = next(parser.line_generator(buffer)).content if re.match(r'^#\s*yamllint disable-file\s*$', first_line): From 65d1347212c1ea169ffab77d494a9c94f8fea5fd Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Sun, 31 Dec 2023 18:10:38 -0500 Subject: [PATCH 4/6] decoder: Autodetect encoding for ignore-from-file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change, yamllint would decode files on the ignore-from-file list using open()’s default encoding [1][2]. This can cause decoding to fail in some situations (see the previous commit message for details). This change makes yamllint automatically detect the encoding for files on the ignore-from-file list. It uses the same algorithm that it uses for detecting the encoding of YAML files, so the same limitations apply: files must use UTF-8, UTF-16 or UTF-32 and they must begin with either a byte order mark or an ASCII character. [1]: [2]: --- docs/configuration.rst | 15 +++++++++++++ tests/test_config.py | 50 +++++++++++++++++++++++++++++++++++++++++- tests/test_decoder.py | 30 +++++++++++++++++++++++++ yamllint/config.py | 14 +++++++----- 4 files changed, 102 insertions(+), 7 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 9624b496..1d0fb8c5 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -228,6 +228,21 @@ or: .. note:: However, this is mutually exclusive with the ``ignore`` key. +.. note:: Files on the ``ignore-from-file`` list should use either UTF-8, + UTF-16 or UTF-32. Additionally, they should start with either an ASCII + character or a byte order mark. 
+ + If you have an ignore file that doesn’t follow those two rules, then you can + set the ``YAMLLINT_FILE_ENCODING`` environment variable to the name of the + character encoding that you want yamllint to use for ignore files. + Specifically, ``YAMLLINT_FILE_ENCODING`` should be set to `the name of one + of Python’s standard character encodings + <https://docs.python.org/3/library/codecs.html#standard-encodings>`_. Please + note, this should only be used as a temporary solution in order to make it + easier to migrate from older versions of yamllint to newer versions of + yamllint. See :doc:`Character Encoding Override + <character_encoding_override>` for details. + If you need to know the exact list of files that yamllint would process, without really linting them, you can use ``--list-files``: diff --git a/tests/test_config.py b/tests/test_config.py index fb570c66..174c0121 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,5 @@ # Copyright (C) 2016 Adrien Vergé +# Copyright (C) 2023–2025 Jason Yundt # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -13,6 +14,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import itertools import os import shutil import sys @@ -20,7 +22,12 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + unregister_test_codecs, +) from yamllint import cli, config from yamllint.config import YamlLintConfigError @@ -820,3 +827,44 @@ def test_run_with_ignore_on_ignored_file(self): sys.stdout.getvalue().strip(), 'file-at-root.yaml:4:17: [error] trailing spaces (trailing-spaces)' ) + + def create_ignore_file(self, text, codec): + path = os.path.join(self.wd, f'{codec}.ignore') + with open(path, 'wb') as f: + f.write(text.encode(codec)) + self.addCleanup(lambda: os.remove(path)) + return path + + def test_ignored_from_file_with_multiple_encodings(self): + register_test_codecs() + self.addCleanup(unregister_test_codecs) + + ignore_files = itertools.starmap( + self.create_ignore_file, ( + ('bin/file.lint-me-anyway.yaml\n', 'utf_32_be'), + ('bin/file.yaml\n', 'utf_32_be_sig'), + ('file-at-root.yaml\n', 'utf_32_le'), + ('file.dont-lint-me.yaml\n', 'utf_32_le_sig'), + + ('ign-dup/file.yaml\n', 'utf_16_be'), + ('ign-dup/sub/dir/file.yaml\n', 'utf_16_be_sig'), + ('ign-trail/file.yaml\n', 'utf_16_le'), + ('include/ign-dup/sub/dir/file.yaml\n', 'utf_16_le_sig'), + + ('s/s/ign-trail/file.yaml\n', 'utf_8'), + ( + 's/s/ign-trail/s/s/file.yaml\n' + 's/s/ign-trail/s/s/file2.lint-me-anyway.yaml\n' + '.yamllint\n', + + 'utf_8_sig' + ), + ) + ) + conf = ('---\n' + 'extends: default\n' + f'ignore-from-file: [{", ".join(ignore_files)}]\n') + + with self.assertRaises(SystemExit) as cm: + cli.run(('-d', conf, '.')) + self.assertEqual(cm.exception.code, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py index e7723ef8..7daa1dd3 100644 --- a/tests/test_decoder.py +++ b/tests/test_decoder.py @@ -14,6 +14,7 @@ # along with this program. If not, see . 
import codecs +import itertools import os import unittest @@ -22,6 +23,8 @@ encoding_detectable, is_test_codec, register_test_codecs, + temp_workspace, + temp_workspace_with_files_in_many_codecs, test_codec_built_in_equivalent, unregister_test_codecs, uses_bom, @@ -452,3 +455,30 @@ def test_auto_decode_with_strings_encoded_at_runtime(self): msg=("None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a " "decoding error.") ) + + def perform_lines_in_file_test(self, strings): + workspace = temp_workspace_with_files_in_many_codecs( + '{}', + '\n'.join(strings) + ) + with temp_workspace(workspace): + iterable = zip( + itertools.cycle(strings), + decoder.lines_in_files(workspace.keys()) + ) + for item in iterable: + self.assertEqual(item[0], item[1]) + + def test_lines_in_file(self): + self.perform_lines_in_file_test(( + "YAML", + "ⓎⒶⓂⓁ", + "🅨🅐🅜🅛", + "YAML" + )) + self.perform_lines_in_file_test(( + "𝐘𝐀𝐌𝐋", + "𝖄𝕬𝕸𝕷", + "𝒀𝑨𝑴𝑳", + "𝓨𝓐𝓜𝓛" + )) diff --git a/yamllint/config.py b/yamllint/config.py index c40d8205..b7d389fc 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -13,7 +13,6 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
-import fileinput import os.path import pathspec @@ -110,8 +109,10 @@ def parse(self, raw_content): raise YamlLintConfigError( 'invalid config: ignore-from-file should contain ' 'filename(s), either as a list or string') - with fileinput.input(conf['ignore-from-file']) as f: - self.ignore = pathspec.PathSpec.from_lines('gitwildmatch', f) + self.ignore = pathspec.PathSpec.from_lines( + 'gitwildmatch', + decoder.lines_in_files(conf['ignore-from-file']) + ) elif 'ignore' in conf: if isinstance(conf['ignore'], str): self.ignore = pathspec.PathSpec.from_lines( @@ -164,9 +165,10 @@ def validate_rule_conf(rule, conf): raise YamlLintConfigError( 'invalid config: ignore-from-file should contain ' 'valid filename(s), either as a list or string') - with fileinput.input(conf['ignore-from-file']) as f: - conf['ignore'] = pathspec.PathSpec.from_lines( - 'gitwildmatch', f) + conf['ignore'] = pathspec.PathSpec.from_lines( + 'gitwildmatch', + decoder.lines_in_files(conf['ignore-from-file']) + ) elif ('ignore' in conf and not isinstance( conf['ignore'], pathspec.pathspec.PathSpec)): if isinstance(conf['ignore'], str): From 7a7fa27c7a86fdfd5f9431261cfc76e5c9a39533 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Sun, 31 Dec 2023 17:36:48 -0500 Subject: [PATCH 5/6] =?UTF-8?q?tests:=20Stop=20using=20open()=E2=80=99s=20?= =?UTF-8?q?default=20encoding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In general, using open()’s default encoding is a mistake [1]. This change makes sure that every time open() is called, the encoding parameter is specified. 
Specifically, it makes it so that all tests succeed when run like this: python -X warn_default_encoding -W error::EncodingWarning -m unittest discover [1]: --- tests/common.py | 5 +++-- tests/test_cli.py | 23 +++++++++++++-------- tests/test_config.py | 49 ++++++++++++++++++++++++++------------------ tests/test_module.py | 6 ++++-- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/tests/common.py b/tests/common.py index 1891a198..367bdd82 100644 --- a/tests/common.py +++ b/tests/common.py @@ -150,8 +150,9 @@ def build_temp_workspace(files): else: if isinstance(content, Blob): content = content.text.encode(content.encoding) - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: + elif isinstance(content, str): + content = content.encode('utf_8') + with open(path, 'wb') as f: f.write(content) return tempdir diff --git a/tests/test_cli.py b/tests/test_cli.py index d8045d3d..ab80f2b8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -304,14 +304,14 @@ def test_run_with_implicit_extends_config(self): (ctx.returncode, ctx.stdout, ctx.stderr), (0, expected_out, '')) def test_run_with_config_file(self): - with open(os.path.join(self.wd, 'config'), 'w') as f: + with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: disable}') with RunContext(self) as ctx: cli.run(('-c', f.name, os.path.join(self.wd, 'a.yaml'))) self.assertEqual(ctx.returncode, 0) - with open(os.path.join(self.wd, 'config'), 'w') as f: + with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: enable}') with RunContext(self) as ctx: @@ -327,14 +327,14 @@ def test_run_with_user_global_config_file(self): self.addCleanup(os.environ.__delitem__, 'HOME') os.environ['HOME'] = home - with open(config, 'w') as f: + with open(config, 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: disable}') with RunContext(self) as ctx: cli.run((os.path.join(self.wd, 
'a.yaml'), )) self.assertEqual(ctx.returncode, 0) - with open(config, 'w') as f: + with open(config, 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: enable}') with RunContext(self) as ctx: @@ -347,7 +347,8 @@ def test_run_with_user_xdg_config_home_in_env(self): with tempfile.TemporaryDirectory('w') as d: os.environ['XDG_CONFIG_HOME'] = d os.makedirs(os.path.join(d, 'yamllint')) - with open(os.path.join(d, 'yamllint', 'config'), 'w') as f: + path = os.path.join(d, 'yamllint', 'config') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: relaxed') with RunContext(self) as ctx: cli.run(('-f', 'parsable', os.path.join(self.wd, 'warn.yaml'))) @@ -357,7 +358,7 @@ def test_run_with_user_xdg_config_home_in_env(self): def test_run_with_user_yamllint_config_file_in_env(self): self.addCleanup(os.environ.__delitem__, 'YAMLLINT_CONFIG_FILE') - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: os.environ['YAMLLINT_CONFIG_FILE'] = f.name f.write('rules: {trailing-spaces: disable}') f.flush() @@ -365,7 +366,7 @@ def test_run_with_user_yamllint_config_file_in_env(self): cli.run((os.path.join(self.wd, 'a.yaml'), )) self.assertEqual(ctx.returncode, 0) - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: os.environ['YAMLLINT_CONFIG_FILE'] = f.name f.write('rules: {trailing-spaces: enable}') f.flush() @@ -509,7 +510,11 @@ def test_run_default_format_output_in_tty(self): # Create a pseudo-TTY and redirect stdout to it old_stdout = sys.stdout master, slave = pty.openpty() - sys.stdout = os.fdopen(slave, 'w') + sys.stdout = os.fdopen( + slave, + 'w', + encoding=os.device_encoding(slave) + ) with self.assertRaises(SystemExit) as ctx: cli.run((path, )) @@ -518,7 +523,7 @@ def test_run_default_format_output_in_tty(self): self.assertEqual(ctx.exception.code, 1) # Read output from TTY - output = os.fdopen(master, 'r') + output = os.fdopen(master, 'r', 
encoding=os.device_encoding(master)) os.set_blocking(master, False) out = output.read().replace('\r\n', '\n') diff --git a/tests/test_config.py b/tests/test_config.py index 174c0121..05b7cac0 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -259,7 +259,7 @@ def test_extend_on_object(self): self.assertEqual(len(new.enabled_rules(None)), 2) def test_extend_on_file(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -278,7 +278,7 @@ def test_extend_on_file(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_remove_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -297,7 +297,7 @@ def test_extend_remove_rule(self): self.assertEqual(len(c.enabled_rules(None)), 1) def test_extend_edit_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -319,7 +319,7 @@ def test_extend_edit_rule(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_reenable_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -339,7 +339,7 @@ def test_extend_reenable_rule(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_recursive_default_values(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' braces:\n' ' max-spaces-inside: 1248\n') @@ -354,7 +354,7 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['braces']['min-spaces-inside-empty'], 2357) self.assertEqual(c.rules['braces']['max-spaces-inside-empty'], -1) - 
with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 1337\n') @@ -366,8 +366,8 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['colons']['max-spaces-before'], 1337) self.assertEqual(c.rules['colons']['max-spaces-after'], 1) - with tempfile.NamedTemporaryFile('w') as f1, \ - tempfile.NamedTemporaryFile('w') as f2: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f1, \ + tempfile.NamedTemporaryFile('w', encoding='utf_8') as f2: f1.write('rules:\n' ' colons:\n' ' max-spaces-before: 1337\n') @@ -384,7 +384,7 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['colons']['max-spaces-after'], 1) def test_extended_ignore_str(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('ignore: |\n' ' *.template.yaml\n') f.flush() @@ -394,7 +394,7 @@ def test_extended_ignore_str(self): self.assertEqual(c.ignore.match_file('test.yaml'), False) def test_extended_ignore_list(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('ignore:\n' ' - "*.template.yaml"\n') f.flush() @@ -564,7 +564,8 @@ def test_no_ignore(self): ))) def test_run_with_ignore_str(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore: |\n' ' *.dont-lint-me.yaml\n' @@ -618,7 +619,8 @@ def test_run_with_ignore_str(self): ))) def test_run_with_ignore_list(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore:\n' ' - "*.dont-lint-me.yaml"\n' @@ -672,19 +674,22 @@ def test_run_with_ignore_list(self): ))) def 
test_run_with_ignore_from_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore-from-file: .gitignore\n' 'rules:\n' ' key-duplicates:\n' ' ignore-from-file: .ignore-key-duplicates\n') - with open(os.path.join(self.wd, '.gitignore'), 'w') as f: + path = os.path.join(self.wd, '.gitignore') + with open(path, 'w', encoding='utf_8') as f: f.write('*.dont-lint-me.yaml\n' '/bin/\n' '!/bin/*.lint-me-anyway.yaml\n') - with open(os.path.join(self.wd, '.ignore-key-duplicates'), 'w') as f: + path = os.path.join(self.wd, '.ignore-key-duplicates') + with open(path, 'w', encoding='utf_8') as f: f.write('/ign-dup\n') sys.stdout = StringIO() @@ -729,13 +734,16 @@ def test_run_with_ignore_from_file(self): ))) def test_run_with_ignored_from_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('ignore-from-file: [.gitignore, .yamlignore]\n' 'extends: default\n') - with open(os.path.join(self.wd, '.gitignore'), 'w') as f: + path = os.path.join(self.wd, '.gitignore') + with open(path, 'w', encoding='utf_8') as f: f.write('*.dont-lint-me.yaml\n' '/bin/\n') - with open(os.path.join(self.wd, '.yamlignore'), 'w') as f: + path = os.path.join(self.wd, '.yamlignore') + with open(path, 'w', encoding='utf_8') as f: f.write('!/bin/*.lint-me-anyway.yaml\n') sys.stdout = StringIO() @@ -794,7 +802,7 @@ def test_run_with_ignore_with_broken_symlink(self): cli.run(('-f', 'parsable', '.')) self.assertNotEqual(ctx.returncode, 0) - with open(os.path.join(wd, '.yamllint'), 'w') as f: + with open(os.path.join(wd, '.yamllint'), 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore: |\n' ' *404.yaml\n') @@ -812,7 +820,8 @@ def test_run_with_ignore_with_broken_symlink(self): shutil.rmtree(wd) def test_run_with_ignore_on_ignored_file(self): - 
with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('ignore: file.dont-lint-me.yaml\n' 'rules:\n' ' trailing-spaces: enable\n' diff --git a/tests/test_module.py b/tests/test_module.py index 7f4f62ba..b4e24e38 100644 --- a/tests/test_module.py +++ b/tests/test_module.py @@ -28,12 +28,14 @@ def setUp(self): self.wd = tempfile.mkdtemp(prefix='yamllint-tests-') # file with only one warning - with open(os.path.join(self.wd, 'warn.yaml'), 'w') as f: + path = os.path.join(self.wd, 'warn.yaml') + with open(path, 'w', encoding='utf_8') as f: f.write('key: value\n') # file in dir os.mkdir(os.path.join(self.wd, 'sub')) - with open(os.path.join(self.wd, 'sub', 'nok.yaml'), 'w') as f: + path = os.path.join(self.wd, 'sub', 'nok.yaml') + with open(path, 'w', encoding='utf_8') as f: f.write('---\n' 'list: [ 1, 1, 2, 3, 5, 8] \n') From e14f4017931584aeeaaa42b07ecd01f9017002f5 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Wed, 3 Jan 2024 12:03:13 -0500 Subject: [PATCH 6/6] =?UTF-8?q?CI:=20Fail=20when=20open()=E2=80=99s=20defa?= =?UTF-8?q?ult=20encoding=20is=20used?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous few commits have removed all calls to open() that use its default encoding. That being said, it’s still possible that code added in the future will contain that same mistake. This commit makes it so that the CI test job will fail if that mistake is made again. Unfortunately, it doesn’t look like coverage.py allows you to specify -X options [1] or warning filters [2] when running your tests [3]. To work around this problem, I’m running all of the Python code, including coverage.py itself, with -X warn_default_encoding and -W error::EncodingWarning. As a result, the CI test job will also fail if coverage.py uses open()’s default encoding. Hopefully, coverage.py won’t do that. 
If it does, then we can always temporarily revert this commit. [1]: [2]: [3]: --- .github/workflows/ci.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 050c64c4..689a2f7e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,6 +57,13 @@ jobs: - run: pip install . # https://github.com/AndreMiras/coveralls-python-action/issues/18 - run: echo -e "[run]\nrelative_files = True" > .coveragerc - - run: coverage run -m unittest discover + - run: >- + python + -X warn_default_encoding + -W error::EncodingWarning + -m coverage + run + -m unittest + discover - name: Coveralls uses: AndreMiras/coveralls-python-action@develop