Skip to content

Commit

Permalink
decoder: Autodetect encoding of YAML files
Browse files Browse the repository at this point in the history
Before this change, yamllint would open YAML files using open()’s
default encoding. As long as UTF-8 mode isn’t enabled, open() defaults
to using the system’s locale encoding [1][2]. This can cause problems in
multiple different scenarios.

The first scenario involves linting UTF-8 YAML files on Linux systems.
Most of the time, the locale encoding on Linux systems is set to UTF-8
[3][4], but it can be set to something else [5]. In the unlikely event
that someone was using Linux with a locale encoding other than UTF-8,
there was a chance that yamllint would crash with a UnicodeDecodeError.

The second scenario involves linting UTF-8 YAML files on Windows
systems. The locale encoding on Windows systems is the system’s ANSI
code page [6]. The ANSI code page on Windows systems is NOT set to UTF-8
by default [7]. In the very likely event that someone was using Windows
with a locale encoding other than UTF-8, there was a chance that
yamllint would crash with a UnicodeDecodeError.

Additionally, using open()’s default encoding is a violation of the YAML
spec. Chapter 5.2 says:

	“On input, a YAML processor must support the UTF-8 and UTF-16
	character encodings. For JSON compatibility, the UTF-32
	encodings must also be supported.

	If a character stream begins with a byte order mark, the
	character encoding will be taken to be as indicated by the byte
	order mark. Otherwise, the stream must begin with an ASCII
	character. This allows the encoding to be deduced by the pattern
	of null (x00) characters.” [8]

This change fixes all of those problems by implementing the YAML spec’s
character encoding detection algorithm. Now, as long as YAML files
begin with either a byte order mark or an ASCII character, yamllint
will automatically detect them as being UTF-8, UTF-16 or UTF-32. Other
character encodings are not supported at the moment.

Credit for the idea of having tests with pre-encoded strings goes to
@adrienverge [9].

Fixes adrienverge#218. Fixes adrienverge#238. Fixes adrienverge#347.

[1]: <https://docs.python.org/3.12/library/functions.html#open>
[2]: <https://docs.python.org/3.12/library/os.html#utf8-mode>
[3]: <https://www.gnu.org/software/libc/manual/html_node/Extended-Char-Intro.html>
[4]: <https://wiki.musl-libc.org/functional-differences-from-glibc.html#Character-sets-and-locale>
[5]: <https://sourceware.org/git/?p=glibc.git;a=blob;f=localedata/SUPPORTED;h=c8b63cc2fe2b4547f2fb1bff6193da68d70bd563;hb=36f2487f13e3540be9ee0fb51876b1da72176d3f>
[6]: <https://docs.python.org/3.12/glossary.html#term-locale-encoding>
[7]: <https://learn.microsoft.com/en-us/windows/apps/design/globalizing/use-utf8-code-page>
[8]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
[9]: <adrienverge#630 (comment)>
  • Loading branch information
Jayman2000 committed Nov 29, 2024
1 parent 4881789 commit e5ef039
Show file tree
Hide file tree
Showing 7 changed files with 729 additions and 39 deletions.
182 changes: 148 additions & 34 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,168 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import codecs
import contextlib
from io import StringIO
import os
import shutil
import sys
import tempfile
import unittest
import warnings
from codecs import CodecInfo

import yaml

from yamllint import linter
from yamllint.config import YamlLintConfig


# Encoding related stuff:
# Codec names used to encode sample files in the tests, widest encoding
# first.  Each row pairs a codec with its ``_sig`` counterpart.
UTF_CODECS = (
    'utf_32_be', 'utf_32_be_sig',
    'utf_32_le', 'utf_32_le_sig',
    'utf_16_be', 'utf_16_be_sig',
    'utf_16_le', 'utf_16_le_sig',
    'utf_8', 'utf_8_sig',
)


def encode_utf_32_be_sig(obj, errors='strict'):
    """Encode obj as big-endian UTF-32 preceded by a byte order mark.

    Returns a ``(encoded_bytes, len(obj))`` tuple, the shape expected from a
    codec's encode function.
    """
    payload = codecs.encode(obj, 'utf_32_be', errors)
    return codecs.BOM_UTF32_BE + payload, len(obj)


def encode_utf_32_le_sig(obj, errors='strict'):
    """Encode obj as little-endian UTF-32 preceded by a byte order mark.

    Returns a ``(encoded_bytes, len(obj))`` tuple, the shape expected from a
    codec's encode function.
    """
    payload = codecs.encode(obj, 'utf_32_le', errors)
    return codecs.BOM_UTF32_LE + payload, len(obj)


def encode_utf_16_be_sig(obj, errors='strict'):
    """Encode obj as big-endian UTF-16 preceded by a byte order mark.

    Returns a ``(encoded_bytes, len(obj))`` tuple, the shape expected from a
    codec's encode function.
    """
    payload = codecs.encode(obj, 'utf_16_be', errors)
    return codecs.BOM_UTF16_BE + payload, len(obj)


def encode_utf_16_le_sig(obj, errors='strict'):
    """Encode obj as little-endian UTF-16 preceded by a byte order mark.

    Returns a ``(encoded_bytes, len(obj))`` tuple, the shape expected from a
    codec's encode function.
    """
    payload = codecs.encode(obj, 'utf_16_le', errors)
    return codecs.BOM_UTF16_LE + payload, len(obj)


# Test-only codecs, keyed by codec name.  Each one encodes with the matching
# encode_*_sig function from this module and decodes with the built-in
# 'utf_32' or 'utf_16' decoder.
test_codec_infos = {
    'utf_32_be_sig': CodecInfo(
        encode=encode_utf_32_be_sig,
        decode=codecs.getdecoder('utf_32'),
    ),
    'utf_32_le_sig': CodecInfo(
        encode=encode_utf_32_le_sig,
        decode=codecs.getdecoder('utf_32'),
    ),
    'utf_16_be_sig': CodecInfo(
        encode=encode_utf_16_be_sig,
        decode=codecs.getdecoder('utf_16'),
    ),
    'utf_16_le_sig': CodecInfo(
        encode=encode_utf_16_le_sig,
        decode=codecs.getdecoder('utf_16'),
    ),
}


def register_test_codecs():
codecs.register(test_codec_infos.get)


def unregister_test_codecs():
if sys.version_info >= (3, 10, 0):
codecs.unregister(test_codec_infos.get)
else:
warnings.warn(
"This version of Python doesn’t allow us to unregister codecs.",
stacklevel=1
)


def is_test_codec(codec):
    """Return True if codec is one of the names in test_codec_infos."""
    return codec in test_codec_infos


def test_codec_built_in_equivalent(test_codec):
return_value = test_codec
for suffix in ('_sig', '_be', '_le'):
return_value = return_value.replace(suffix, '')
return return_value


def uses_bom(codec):
    """Return True if the codec name ends with '_32', '_16' or '_sig'."""
    return codec.endswith(('_32', '_16', '_sig'))


def encoding_detectable(string, codec):
    """
    Tell whether the encoding can be detected once string is encoded.

    Detection only works when a BOM is used or when the first character is
    ASCII. See the docstring of yamllint.decoder.auto_decode().
    """
    if uses_bom(codec):
        return True
    if not string:
        # Without a BOM, an empty string gives detection nothing to go on.
        return False
    return string[0].isascii()


# Workspace related stuff:
class Blob:
    """Text paired with the name of the codec it should be encoded with."""

    def __init__(self, text, encoding):
        self.text, self.encoding = text, encoding


def build_temp_workspace(files):
    """Create a temporary directory populated as described by files.

    files maps a path, relative to the new directory, to its content:

    - a list creates a directory at that path;
    - a str starting with 'symlink://' creates a symlink whose target is the
      rest of the string;
    - any other str is written as UTF-8 text;
    - a Blob is written as its text encoded with its encoding;
    - bytes are written unchanged.

    Missing parent directories are created.  Returns the path of the new
    directory; removing it is the caller's job.
    """
    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')

    for path, content in files.items():
        path = os.fsencode(os.path.join(tempdir, path))
        parent = os.path.dirname(path)
        if not os.path.exists(parent):
            os.makedirs(parent)

        if isinstance(content, list):
            os.mkdir(path)
        elif isinstance(content, str) and content.startswith('symlink://'):
            os.symlink(content[10:], path)
        elif isinstance(content, str):
            # Pin the encoding: open()'s default follows the locale, which
            # would make the bytes on disk differ from one machine to another.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            if isinstance(content, Blob):
                content = content.text.encode(content.encoding)
            with open(path, 'wb') as f:
                f.write(content)

    return tempdir


@contextlib.contextmanager
def temp_workspace(files):
    """Run the with-body inside a throwaway workspace built from files.

    The previous working directory is restored and the workspace deleted on
    exit, even if the body raises.
    """
    original_cwd = os.getcwd()
    workspace_dir = build_temp_workspace(files)

    try:
        os.chdir(workspace_dir)
        yield
    finally:
        # Step out of the workspace before deleting it.
        os.chdir(original_cwd)
        shutil.rmtree(workspace_dir)


def temp_workspace_with_files_in_many_codecs(path_template, text):
    """Return a files mapping holding text once per usable codec.

    For each codec in UTF_CODECS whose encoding of text is detectable, the
    mapping gets an entry whose key is path_template formatted with the codec
    name and whose value is a Blob of text in that codec.
    """
    return {
        path_template.format(codec): Blob(text, codec)
        for codec in UTF_CODECS
        if encoding_detectable(text, codec)
    }


# Miscellaneous stuff:
class RuleTestCase(unittest.TestCase):
def build_fake_config(self, conf):
if conf is None:
Expand Down Expand Up @@ -81,37 +229,3 @@ def __exit__(self, *exc_info):
@property
def returncode(self):
return self._raises_ctx.exception.code


def build_temp_workspace(files):
    """Create a temporary directory populated as described by files.

    files maps a path, relative to the new directory, to its content:

    - a list creates a directory at that path;
    - a str starting with 'symlink://' creates a symlink whose target is the
      rest of the string;
    - any other str is written as UTF-8 text;
    - a Blob is written as its text encoded with its encoding;
    - bytes are written unchanged.

    Missing parent directories are created.  Returns the path of the new
    directory; removing it is the caller's job.
    """
    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')

    for path, content in files.items():
        path = os.fsencode(os.path.join(tempdir, path))
        parent = os.path.dirname(path)
        if not os.path.exists(parent):
            os.makedirs(parent)

        if isinstance(content, list):
            os.mkdir(path)
        elif isinstance(content, str) and content.startswith('symlink://'):
            os.symlink(content[10:], path)
        elif isinstance(content, str):
            # Pin the encoding: open()'s default follows the locale, which
            # would make the bytes on disk differ from one machine to another.
            with open(path, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            # Accept Blob values here too, so this definition agrees with
            # the Blob-aware build_temp_workspace earlier in the file.
            if isinstance(content, Blob):
                content = content.text.encode(content.encoding)
            with open(path, 'wb') as f:
                f.write(content)

    return tempdir


@contextlib.contextmanager
def temp_workspace(files):
    """Run the with-body inside a throwaway workspace built from files.

    The previous working directory is restored and the workspace deleted on
    exit, even if the body raises.
    """
    original_cwd = os.getcwd()
    workspace_dir = build_temp_workspace(files)

    try:
        os.chdir(workspace_dir)
        yield
    finally:
        # Step out of the workspace before deleting it.
        os.chdir(original_cwd)
        shutil.rmtree(workspace_dir)
58 changes: 57 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,14 @@
import unittest
from io import StringIO

from tests.common import build_temp_workspace, RunContext, temp_workspace
from tests.common import (
build_temp_workspace,
register_test_codecs,
RunContext,
temp_workspace,
unregister_test_codecs,
temp_workspace_with_files_in_many_codecs,
)

from yamllint import cli, config

Expand Down Expand Up @@ -819,3 +826,52 @@ def test_multiple_parent_config_file(self):
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
(0, './4spaces.yml:2:5: [warning] wrong indentation: '
'expected 3 but found 4 (indentation)\n', ''))


class CommandLineEncodingTestCase(unittest.TestCase):
    """Run the CLI on configs and YAML files stored in many encodings."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        register_test_codecs()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        unregister_test_codecs()

    def test_valid_encodings(self):
        """Every config encoding must give the same key-ordering verdicts."""
        config_files = temp_workspace_with_files_in_many_codecs(
            'config_{}.yaml',
            '---\n'
            'rules:\n'
            ' key-ordering: enable\n'
        )
        sorted_correctly_files = temp_workspace_with_files_in_many_codecs(
            'sorted_correctly/{}.yaml',
            '---\n'
            'A: YAML\n'
            'Z: YAML\n'
        )
        sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs(
            'sorted_incorrectly/{}.yaml',
            '---\n'
            'Z: YAML\n'
            'A: YAML\n'
        )

        workspace = dict(config_files)
        workspace.update(sorted_correctly_files)
        workspace.update(sorted_incorrectly_files)

        # (directory to lint, whether the run is expected to exit with 0)
        expectations = (
            ('sorted_correctly/', True),
            ('sorted_incorrectly/', False),
        )

        with temp_workspace(workspace):
            for config_path in config_files:
                for directory, expect_success in expectations:
                    with RunContext(self) as ctx:
                        cli.run(('-c', config_path, directory))
                    if expect_success:
                        self.assertEqual(ctx.returncode, 0)
                    else:
                        self.assertNotEqual(ctx.returncode, 0)
Loading

0 comments on commit e5ef039

Please sign in to comment.