From cde58967e6b0a5ac212f5dc48629af4b9ccc6d60 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 21:23:13 -0700 Subject: [PATCH 01/14] use codecs --- yamllint/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yamllint/cli.py b/yamllint/cli.py index e99fd2ca..9165452c 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -17,7 +17,7 @@ from __future__ import print_function import argparse -import io +import codecs import os import platform import sys @@ -177,7 +177,7 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with io.open(file, newline='') as f: + with codecs.open(file) as f: problems = linter.run(f, conf, filepath) except EnvironmentError as e: print(e, file=sys.stderr) From 11fa9edfb85a378dacceba42bf30c2607a2e8574 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 21:51:21 -0700 Subject: [PATCH 02/14] determine encoding --- yamllint/cli.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/yamllint/cli.py b/yamllint/cli.py index 9165452c..96e18925 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -18,6 +18,7 @@ import argparse import codecs +import io import os import platform import sys @@ -28,6 +29,17 @@ from yamllint.linter import PROBLEM_LEVELS +def determine_encoding(file): + with io.open(file, 'rb') as raw_file: + data = raw_file.read() + if data.startswith(codecs.BOM_UTF16_LE): + encoding = 'utf-16-le' + elif data.startswith(codecs.BOM_UTF16_BE): + encoding = 'utf-16-be' + else: + encoding = 'utf-8' + return encoding + def find_files_recursively(items, conf): for item in items: if os.path.isdir(item): @@ -177,7 +189,8 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with codecs.open(file) as f: + encoding = determine_encoding(file) + with io.open(file, newline='', encoding=encoding) as f: problems = linter.run(f, conf, filepath) except EnvironmentError as e: print(e, file=sys.stderr) From d3cd29edb68081ff1cf6d6ecb1322a1573d92186 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 21:51:44 -0700 Subject: [PATCH 03/14] add newline --- yamllint/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yamllint/cli.py b/yamllint/cli.py index 96e18925..d3144812 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -40,6 +40,7 @@ def determine_encoding(file): encoding = 'utf-8' return encoding + def find_files_recursively(items, conf): for item in items: if os.path.isdir(item): From 10daf4c45f9553ebe86b32d76446d2b532520c4b Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 22:12:22 -0700 Subject: [PATCH 04/14] add files with boms to test dir --- tests/test_cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 517bc624..1c497fe5 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -91,6 +91,10 @@ def setUpClass(cls): # dos line endings yaml 'dos.yml': '---\r\n' 'dos: true', + # UTF-16 BOM + 'non-ascii/utf16': b'\xff\xfe---\nutf16: true\n', + #UTF-8 BOM + 'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n', }) @classmethod @@ -171,6 +175,8 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), + os.path.join(self.wd, 'non-ascii/utf16'), + os.path.join(self.wd, 'non-ascii/utf8'), os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'), os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'), os.path.join(self.wd, 'sub/ok.yaml'), @@ -188,6 +194,8 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), + os.path.join(self.wd, 'non-ascii/utf16'), + os.path.join(self.wd, 'non-ascii/utf8'), os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'), os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'), os.path.join(self.wd, 'sub/ok.yaml'), From d60e9fd0f685fa3eda3b20ebf193a484a8ff9bd3 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 22:19:01 -0700 Subject: [PATCH 05/14] don't read whole file to detect encoding --- yamllint/cli.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/yamllint/cli.py b/yamllint/cli.py index d3144812..d84c6f08 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -31,14 +31,14 @@ def determine_encoding(file): with io.open(file, 'rb') as raw_file: - data = raw_file.read() - if data.startswith(codecs.BOM_UTF16_LE): - encoding = 'utf-16-le' - elif data.startswith(codecs.BOM_UTF16_BE): - encoding = 'utf-16-be' - else: - encoding = 'utf-8' - return encoding + data = raw_file.read(4) + if data.startswith(codecs.BOM_UTF16_LE): + encoding = 'utf-16-le' + elif data.startswith(codecs.BOM_UTF16_BE): + encoding = 'utf-16-be' + else: + encoding = 'utf-8' + return encoding def find_files_recursively(items, conf): From b0d6d55877763fb258abc83818a5486a4796bfa7 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 22:30:02 -0700 Subject: [PATCH 06/14] add encoding tests --- tests/test_cli.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 1c497fe5..8cb91bb8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -91,8 +91,10 @@ def setUpClass(cls): # dos line endings yaml 'dos.yml': '---\r\n' 'dos: true', - # UTF-16 BOM - 'non-ascii/utf16': b'\xff\xfe---\nutf16: true\n', + # UTF-16 Little Endian BOM + 'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n', + #UTF-16 Big Endian + 'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n', #UTF-8 BOM 'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n', }) @@ -175,7 +177,8 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), - os.path.join(self.wd, 'non-ascii/utf16'), + os.path.join(self.wd, 'non-ascii/utf16be'), + os.path.join(self.wd, 'non-ascii/utf16le'), os.path.join(self.wd, 'non-ascii/utf8'), os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'), os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'), @@ -194,7 +197,8 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), - os.path.join(self.wd, 'non-ascii/utf16'), + os.path.join(self.wd, 'non-ascii/utf16be'), + os.path.join(self.wd, 'non-ascii/utf16le'), os.path.join(self.wd, 'non-ascii/utf8'), os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'), os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'), @@ -525,3 +529,18 @@ def test_run_non_universal_newline(self): '\n' % path) self.assertEqual( (ctx.returncode, ctx.stdout, ctx.stderr), (1, expected_out, '')) + + def test_encoding_detection_utf16le(self): + path = os.path.join(self.wd, 'non-ascii/utf16le') + encoding = cli.determine_encoding(path) + self.assertEqual(encoding, 'utf-16-le') + + def test_encoding_detection_utf16be(self): + path = os.path.join(self.wd, 'non-ascii/utf16be') + encoding = cli.determine_encoding(path) + self.assertEqual(encoding, 'utf-16-be') + + def test_encoding_detection_utf8(self): + path = os.path.join(self.wd, 'non-ascii/utf8') + encoding = cli.determine_encoding(path) + self.assertEqual(encoding, 'utf-8') From 4e6c030dadddae6fe9633600ff07653cba2b934e Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Tue, 24 Mar 2020 22:32:33 -0700 Subject: [PATCH 07/14] make flake8 happy --- tests/test_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 8cb91bb8..a671cdf2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -93,9 +93,9 @@ def setUpClass(cls): 'dos: true', # UTF-16 Little Endian BOM 'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n', - #UTF-16 Big Endian + # UTF-16 Big Endian 'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n', - #UTF-8 BOM + # UTF-8 BOM 'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n', }) From a68a80143a449ceb2045163e3c3284b02c2f6eb7 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 21:21:34 -0700 Subject: [PATCH 08/14] use chardet for encoding detection --- setup.py | 2 +- tests/test_cli.py | 19 +++++++++++-------- yamllint/cli.py | 26 +++++++++++++------------- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index ffa2ee22..df6cfeb4 100644 --- a/setup.py +++ b/setup.py @@ -51,6 +51,6 @@ packages=find_packages(exclude=['tests', 'tests.*']), entry_points={'console_scripts': ['yamllint=yamllint.cli:run']}, package_data={'yamllint': ['conf/*.yaml']}, - install_requires=['pathspec >=0.5.3', 'pyyaml'], + install_requires=['pathspec >=0.5.3', 'pyyaml', 'chardet'], test_suite='tests', ) diff --git a/tests/test_cli.py b/tests/test_cli.py index a671cdf2..76fef52d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -92,9 +92,9 @@ def setUpClass(cls): 'dos.yml': '---\r\n' 'dos: true', # UTF-16 Little Endian BOM - 'non-ascii/utf16le': b'\xff\xfe---\nutf16le: true\n', + 'non-ascii/utf16le': b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'), # UTF-16 Big Endian - 'non-ascii/utf16be': b'\xfe\xff---\nutf16be: true\n', + 'non-ascii/utf16be': b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'), # UTF-8 BOM 'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n', }) @@ -532,15 +532,18 @@ def test_run_non_universal_newline(self): def test_encoding_detection_utf16le(self): path = os.path.join(self.wd, 'non-ascii/utf16le') - encoding = cli.determine_encoding(path) - self.assertEqual(encoding, 'utf-16-le') + with RunContext(self) as ctx: + cli.run(('-f', 'parsable', path)) + self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) def test_encoding_detection_utf16be(self): path = os.path.join(self.wd, 'non-ascii/utf16be') - encoding = cli.determine_encoding(path) - self.assertEqual(encoding, 'utf-16-be') + with RunContext(self) as ctx: + cli.run(('-f', 'parsable', path)) + self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) def test_encoding_detection_utf8(self): path = os.path.join(self.wd, 'non-ascii/utf8') - encoding = cli.determine_encoding(path) - self.assertEqual(encoding, 'utf-8') + with RunContext(self) as ctx: + cli.run(('-f', 'parsable', path)) + self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) diff --git a/yamllint/cli.py b/yamllint/cli.py index d84c6f08..385c27f5 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -17,7 +17,8 @@ from __future__ import print_function import argparse -import codecs +import chardet +import contextlib import io import os import platform @@ -29,16 +30,16 @@ from yamllint.linter import PROBLEM_LEVELS -def determine_encoding(file): - with io.open(file, 'rb') as raw_file: - data = raw_file.read(4) - if data.startswith(codecs.BOM_UTF16_LE): - encoding = 'utf-16-le' - elif data.startswith(codecs.BOM_UTF16_BE): - encoding = 'utf-16-be' - else: - encoding = 'utf-8' - return encoding +@contextlib.contextmanager +def yamlopen(fp, **iowrapper_kwargs): + encoding = iowrapper_kwargs.pop('encoding', None) + with io.open(fp, mode='rb') as raw_file: + if encoding is None: + raw_data = raw_file.read() + encoding = chardet.detect(raw_data).get('encoding') or 'utf-8' + raw_file.seek(0) + with io.TextIOWrapper(raw_file, encoding=encoding, **iowrapper_kwargs) as decoded: + yield decoded def find_files_recursively(items, conf): @@ -190,8 +191,7 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - encoding = determine_encoding(file) - with io.open(file, newline='', encoding=encoding) as f: + with yamlopen(file, newline='') as f: problems = linter.run(f, conf, filepath) except EnvironmentError as e: print(e, file=sys.stderr) From efd597f00fb9252f9432f7bda0d4c33699a8f186 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 21:38:57 -0700 Subject: [PATCH 09/14] make things ugly to make flake8 happy --- tests/test_cli.py | 26 ++++++++++++++------------ yamllint/cli.py | 8 +++++--- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 76fef52d..00c1fa3a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -92,11 +92,13 @@ def setUpClass(cls): 'dos.yml': '---\r\n' 'dos: true', # UTF-16 Little Endian BOM - 'non-ascii/utf16le': b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'), + 'non-ascii/utf-16-le': + b'\xff\xfe' + u'---\nutf16le: true\n'.encode('utf-16-le'), # UTF-16 Big Endian - 'non-ascii/utf16be': b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'), + 'non-ascii/utf-16-be': + b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'), # UTF-8 BOM - 'non-ascii/utf8': b'\xef\xbb\xbf---\nutf8: true\n', + 'non-ascii/utf-8': b'\xef\xbb\xbf---\nutf8: true\n', }) @classmethod @@ -177,9 +179,9 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), - os.path.join(self.wd, 'non-ascii/utf16be'), - os.path.join(self.wd, 'non-ascii/utf16le'), - os.path.join(self.wd, 'non-ascii/utf8'), + os.path.join(self.wd, 'non-ascii/utf-16-be'), + os.path.join(self.wd, 'non-ascii/utf-16-le'), + os.path.join(self.wd, 'non-ascii/utf-8'), os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'), os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'), os.path.join(self.wd, 'sub/ok.yaml'), @@ -197,9 +199,9 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), - os.path.join(self.wd, 'non-ascii/utf16be'), - os.path.join(self.wd, 'non-ascii/utf16le'), - os.path.join(self.wd, 'non-ascii/utf8'), + os.path.join(self.wd, 'non-ascii/utf-16-be'), + os.path.join(self.wd, 'non-ascii/utf-16-le'), + os.path.join(self.wd, 'non-ascii/utf-8'), os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8'), os.path.join(self.wd, 's/s/s/s/s/s/s/s/s/s/s/s/s/s/s/file.yaml'), os.path.join(self.wd, 'sub/ok.yaml'), @@ -531,19 +533,19 @@ def test_run_non_universal_newline(self): (ctx.returncode, ctx.stdout, ctx.stderr), (1, expected_out, '')) def test_encoding_detection_utf16le(self): - path = os.path.join(self.wd, 'non-ascii/utf16le') + path = os.path.join(self.wd, 'non-ascii/utf-16-le') with RunContext(self) as ctx: cli.run(('-f', 'parsable', path)) self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) def test_encoding_detection_utf16be(self): - path = os.path.join(self.wd, 'non-ascii/utf16be') + path = os.path.join(self.wd, 'non-ascii/utf-16-be') with RunContext(self) as ctx: cli.run(('-f', 'parsable', path)) self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) def test_encoding_detection_utf8(self): - path = os.path.join(self.wd, 'non-ascii/utf8') + path = os.path.join(self.wd, 'non-ascii/utf-8') with RunContext(self) as ctx: cli.run(('-f', 'parsable', path)) self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) diff --git a/yamllint/cli.py b/yamllint/cli.py index 385c27f5..298d506e 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -17,13 +17,14 @@ from __future__ import print_function import argparse -import chardet import contextlib import io import os import platform import sys +import chardet + from yamllint import APP_DESCRIPTION, APP_NAME, APP_VERSION from yamllint import linter from yamllint.config import YamlLintConfig, YamlLintConfigError @@ -38,10 +39,11 @@ def yamlopen(fp, **iowrapper_kwargs): raw_data = raw_file.read() encoding = chardet.detect(raw_data).get('encoding') or 'utf-8' raw_file.seek(0) - with io.TextIOWrapper(raw_file, encoding=encoding, **iowrapper_kwargs) as decoded: + with io.TextIOWrapper( + raw_file, encoding=encoding, **iowrapper_kwargs + ) as decoded: yield decoded - def find_files_recursively(items, conf): for item in items: if os.path.isdir(item): From f3d5654171da0e426baae78b0702d0bf688f1759 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 21:45:20 -0700 Subject: [PATCH 10/14] fLakE8 --- yamllint/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yamllint/cli.py b/yamllint/cli.py index 298d506e..9898672e 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -44,6 +44,7 @@ def yamlopen(fp, **iowrapper_kwargs): ) as decoded: yield decoded + def find_files_recursively(items, conf): for item in items: if os.path.isdir(item): From 940f95426b61e727ab51233f3382755ef7377ae7 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 21:55:15 -0700 Subject: [PATCH 11/14] new filename now matches --- tests/test_cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 00c1fa3a..3b1e4c2b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -214,7 +214,8 @@ def test_find_files_recursively(self): ' - \'**/utf-8\'\n') self.assertEqual( sorted(cli.find_files_recursively([self.wd], conf)), - [os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')] + [os.path.join(self.wd, 'non-ascii/utf-8'), + os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8')] ) def test_run_with_bad_arguments(self): From b6d4df4fd77e1fdbb27450382459abb856137c36 Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 22:06:20 -0700 Subject: [PATCH 12/14] avoid reading the file twice --- yamllint/cli.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/yamllint/cli.py b/yamllint/cli.py index 9898672e..e413020e 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -33,16 +33,17 @@ @contextlib.contextmanager def yamlopen(fp, **iowrapper_kwargs): - encoding = iowrapper_kwargs.pop('encoding', None) with io.open(fp, mode='rb') as raw_file: - if encoding is None: + if iowrapper_kwargs.get('encoding'): + with io.TextIOWrapper(raw_file, **iowrapper_kwargs) as decoded: + yield decoded + else: raw_data = raw_file.read() encoding = chardet.detect(raw_data).get('encoding') or 'utf-8' - raw_file.seek(0) - with io.TextIOWrapper( - raw_file, encoding=encoding, **iowrapper_kwargs - ) as decoded: - yield decoded + iowrapper_kwargs['encoding'] = encoding + buffer = io.BytesIO(raw_data) + with io.TextIOWrapper(buffer, **iowrapper_kwargs) as decoded: + yield decoded def find_files_recursively(items, conf): From 5ad0ebf23b40f1fdfdd2c1630a92223518c0537c Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 22:11:13 -0700 Subject: [PATCH 13/14] dont shadow builtin names --- yamllint/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yamllint/cli.py b/yamllint/cli.py index e413020e..0d36d75a 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -41,8 +41,8 @@ def yamlopen(fp, **iowrapper_kwargs): raw_data = raw_file.read() encoding = chardet.detect(raw_data).get('encoding') or 'utf-8' iowrapper_kwargs['encoding'] = encoding - buffer = io.BytesIO(raw_data) - with io.TextIOWrapper(buffer, **iowrapper_kwargs) as decoded: + buff = io.BytesIO(raw_data) + with io.TextIOWrapper(buff, **iowrapper_kwargs) as decoded: yield decoded From 15f7cc39d6e5f4965cecdb4e8fc6b2dbc421957f Mon Sep 17 00:00:00 2001 From: Spencer Young Date: Fri, 3 Apr 2020 22:39:32 -0700 Subject: [PATCH 14/14] tests --- tests/test_cli.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 3b1e4c2b..57ee236e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -99,6 +99,8 @@ def setUpClass(cls): b'\xfe\xff' + u'---\nutf16be: true\n'.encode('utf-16-be'), # UTF-8 BOM 'non-ascii/utf-8': b'\xef\xbb\xbf---\nutf8: true\n', + # Random bytes that have no possible encoding + 'non-ascii/undetectable': b'\x05\xfc\x17A\xb6\x15\x15\x90>9' }) @classmethod @@ -179,6 +181,7 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), + os.path.join(self.wd, 'non-ascii/undetectable'), os.path.join(self.wd, 'non-ascii/utf-16-be'), os.path.join(self.wd, 'non-ascii/utf-16-le'), os.path.join(self.wd, 'non-ascii/utf-8'), @@ -199,6 +202,7 @@ def test_find_files_recursively(self): os.path.join(self.wd, 'dos.yml'), os.path.join(self.wd, 'empty.yml'), os.path.join(self.wd, 'no-yaml.json'), + os.path.join(self.wd, 'non-ascii/undetectable'), os.path.join(self.wd, 'non-ascii/utf-16-be'), os.path.join(self.wd, 'non-ascii/utf-16-le'), os.path.join(self.wd, 'non-ascii/utf-8'), @@ -550,3 +554,37 @@ def test_encoding_detection_utf8(self): with RunContext(self) as ctx: cli.run(('-f', 'parsable', path)) self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, '', '')) + + def test_detected_encoding_utf8(self): + path = os.path.join(self.wd, 'non-ascii/éçäγλνπ¥/utf-8') + with cli.yamlopen(path) as yaml_file: + yaml_file.read() + self.assertEqual(yaml_file.encoding, 'utf-8') + + def test_detected_encoding_utf8_sig(self): + path = os.path.join(self.wd, 'non-ascii/utf-8') + with cli.yamlopen(path) as yaml_file: + yaml_file.read() + self.assertEqual(yaml_file.encoding, 'UTF-8-SIG') + + def test_detected_encoding_utf16(self): + path = os.path.join(self.wd, 'non-ascii/utf-16-le') + with cli.yamlopen(path) as yaml_file: + yaml_file.read() + self.assertEqual(yaml_file.encoding, 'UTF-16') + path = os.path.join(self.wd, 'non-ascii/utf-16-be') + with cli.yamlopen(path) as yaml_file: + yaml_file.read() + self.assertEqual(yaml_file.encoding, 'UTF-16') + + def test_explicit_encoding(self): + path = os.path.join(self.wd, 'a.yaml') + with cli.yamlopen(path, encoding='windows-1252') as yaml_file: + yaml_file.read() + self.assertEqual(yaml_file.encoding, 'windows-1252') + + def test_default_encoding(self): + path = os.path.join(self.wd, 'non-ascii/undetectable') + with cli.yamlopen(path) as yaml_file: + encoding = yaml_file.encoding + self.assertEqual(encoding, 'utf-8')