diff --git a/setup.py b/setup.py
index ffa2ee22..df6cfeb4 100644
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,6 @@
     packages=find_packages(exclude=['tests', 'tests.*']),
     entry_points={'console_scripts': ['yamllint=yamllint.cli:run']},
     package_data={'yamllint': ['conf/*.yaml']},
-    install_requires=['pathspec >=0.5.3', 'pyyaml'],
+    install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'],
     test_suite='tests',
 )
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 517bc624..57ee236e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -91,6 +91,16 @@ def setUpClass(cls):
             # dos line endings yaml
             'dos.yml': '---\r\n'
                        'dos: true',
+            # UTF-16 Little Endian BOM
+            'non-ascii/utf-16-le':
+                b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'),
+            # UTF-16 Big Endian
+            'non-ascii/utf-16-be':
+                b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'),
+            # UTF-8 BOM
+            'non-ascii/utf-8': b'\xef\xbb\xbf---\nutf8: true\n',
+            # Random bytes that have no possible encoding
+            'non-ascii/undetectable': b'\x05\xfc\x17A\xb6\x15\x15\x90>9'
         })
 
     @classmethod
@@ -171,6 +181,10 @@ def test_find_files_recursively(self):
              os.path.join(self.wd, 'dos.yml'),
              os.path.join(self.wd, 'empty.yml'),
              os.path.join(self.wd, 'no-yaml.json'),
+             os.path.join(self.wd, 'non-ascii/undetectable'),
+             os.path.join(self.wd, 'non-ascii/utf-16-be'),
+             os.path.join(self.wd, 'non-ascii/utf-16-le'),
+             os.path.join(self.wd, 'non-ascii/utf-8'),
              os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'),
              os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'),
              os.path.join(self.wd, 'sub/ok.yaml'),
@@ -188,6 +202,10 @@ def test_find_files_recursively(self):
              os.path.join(self.wd, 'dos.yml'),
              os.path.join(self.wd, 'empty.yml'),
              os.path.join(self.wd, 'no-yaml.json'),
+             os.path.join(self.wd, 'non-ascii/undetectable'),
+             os.path.join(self.wd, 'non-ascii/utf-16-be'),
+             os.path.join(self.wd, 'non-ascii/utf-16-le'),
+             os.path.join(self.wd, 'non-ascii/utf-8'),
              os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'),
              os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'),
              os.path.join(self.wd, 'sub/ok.yaml'),
@@ -200,7 +218,8 @@ def test_find_files_recursively(self):
                                      ' - \'**/utf-8\'\n')
         self.assertEqual(
             sorted(cli.find_files_recursively([self.wd], conf)),
-            [os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')]
+            [os.path.join(self.wd, 'non-ascii/utf-8'),
+             os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')]
         )
 
     def test_run_with_bad_arguments(self):
@@ -517,3 +536,55 @@ def test_run_non_universal_newline(self):
                         '\n' % path)
         self.assertEqual(
             (ctx.returncode, ctx.stdout, ctx.stderr), (1, expected_out, ''))
+
+    def test_encoding_detection_utf16le(self):
+        path = os.path.join(self.wd, 'non-ascii/utf-16-le')
+        with RunContext(self) as ctx:
+            cli.run(('-f', 'parsable', path))
+        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
+
+    def test_encoding_detection_utf16be(self):
+        path = os.path.join(self.wd, 'non-ascii/utf-16-be')
+        with RunContext(self) as ctx:
+            cli.run(('-f', 'parsable', path))
+        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
+
+    def test_encoding_detection_utf8(self):
+        path = os.path.join(self.wd, 'non-ascii/utf-8')
+        with RunContext(self) as ctx:
+            cli.run(('-f', 'parsable', path))
+        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', ''))
+
+    def test_detected_encoding_utf8(self):
+        path = os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')
+        with cli.yamlopen(path) as yaml_file:
+            yaml_file.read()
+            self.assertEqual(yaml_file.encoding, 'utf-8')
+
+    def test_detected_encoding_utf8_sig(self):
+        path = os.path.join(self.wd, 'non-ascii/utf-8')
+        with cli.yamlopen(path) as yaml_file:
+            yaml_file.read()
+            self.assertEqual(yaml_file.encoding, 'UTF-8-SIG')
+
+    def test_detected_encoding_utf16(self):
+        path = os.path.join(self.wd, 'non-ascii/utf-16-le')
+        with cli.yamlopen(path) as yaml_file:
+            yaml_file.read()
+            self.assertEqual(yaml_file.encoding, 'UTF-16')
+        path = os.path.join(self.wd, 'non-ascii/utf-16-be')
+        with cli.yamlopen(path) as yaml_file:
+            yaml_file.read()
+            self.assertEqual(yaml_file.encoding, 'UTF-16')
+
+    def test_explicit_encoding(self):
+        path = os.path.join(self.wd, 'a.yaml')
+        with cli.yamlopen(path, encoding='windows-1252') as yaml_file:
+            yaml_file.read()
+            self.assertEqual(yaml_file.encoding, 'windows-1252')
+
+    def test_default_encoding(self):
+        path = os.path.join(self.wd, 'non-ascii/undetectable')
+        with cli.yamlopen(path) as yaml_file:
+            encoding = yaml_file.encoding
+        self.assertEqual(encoding, 'utf-8')
diff --git a/yamllint/cli.py b/yamllint/cli.py
index e99fd2ca..0d36d75a 100644
--- a/yamllint/cli.py
+++ b/yamllint/cli.py
@@ -17,17 +17,51 @@
 from __future__ import print_function
 
 import argparse
+import codecs
+import contextlib
 import io
 import os
 import platform
 import sys
 
+import chardet
+
 from yamllint import APP_DESCRIPTION, APP_NAME, APP_VERSION
 from yamllint import linter
 from yamllint.config import YamlLintConfig, YamlLintConfigError
 from yamllint.linter import PROBLEM_LEVELS
 
 
+@contextlib.contextmanager
+def yamlopen(fp, **iowrapper_kwargs):
+    """Open ``fp`` as a text stream, detecting its encoding when needed.
+
+    When a truthy ``encoding`` keyword is given the file is decoded with it.
+    Otherwise the whole file is read into memory, ``chardet`` guesses the
+    encoding, and ``utf-8`` is used when nothing usable is detected.
+
+    :param fp: path of the file to open
+    :param iowrapper_kwargs: keyword arguments for ``io.TextIOWrapper``
+        (e.g. ``encoding``, ``newline``)
+    :return: context manager yielding the decoding ``io.TextIOWrapper``
+    """
+    with io.open(fp, mode='rb') as raw_file:
+        if iowrapper_kwargs.get('encoding'):
+            stream = raw_file
+        else:
+            raw_data = raw_file.read()
+            encoding = chardet.detect(raw_data).get('encoding') or 'utf-8'
+            try:
+                codecs.lookup(encoding)
+            except LookupError:
+                # chardet can name encodings Python has no codec for
+                encoding = 'utf-8'
+            iowrapper_kwargs['encoding'] = encoding
+            stream = io.BytesIO(raw_data)
+        with io.TextIOWrapper(stream, **iowrapper_kwargs) as decoded:
+            yield decoded
+
+
 def find_files_recursively(items, conf):
     for item in items:
         if os.path.isdir(item):
@@ -177,7 +211,7 @@ def run(argv=None):
     for file in find_files_recursively(args.files, conf):
         filepath = file[2:] if file.startswith('./') else file
         try:
-            with io.open(file, newline='') as f:
+            with yamlopen(file, newline='') as f:
                 problems = linter.run(f, conf, filepath)
         except EnvironmentError as e:
             print(e, file=sys.stderr)