Skip to content

Commit

Permalink
Merge pull request #595 from greenbone/improve_encoding_detection
Browse files: browse the repository at this point in the history
Improve encoding detection
  • Loading branch information
n-thumann authored Aug 1, 2023
2 parents b724d71 + 2c620fe commit c53a4e9
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 14 deletions.
86 changes: 85 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ python-magic = "^0.4.25"
chardet = ">=4,<6"
validators = ">=0.18.2,<0.21.0"
gitpython = "^3.1.31"
charset-normalizer = "^3.2.0"

[tool.poetry.dev-dependencies]
autohooks = ">=21.7.0"
Expand Down
6 changes: 4 additions & 2 deletions tests/plugins/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ def test_some_invalid_characters(self):

self.assertIsInstance(results[0], LinterError)
self.assertEqual(
"VT uses a wrong encoding. Detected encoding is utf-8.",
"VT uses a wrong encoding. "
"Allowed encodings are ascii, latin_1.",
results[0].message,
)
self.assertEqual(
Expand Down Expand Up @@ -97,6 +98,7 @@ def test_invisible_whitespace(self):
self.assertEqual(len(results), 1)
self.assertIsInstance(results[0], LinterError)
self.assertEqual(
"VT uses a wrong encoding. Detected encoding is utf-8.",
"VT uses a wrong encoding. "
"Allowed encodings are ascii, latin_1.",
results[0].message,
)
20 changes: 9 additions & 11 deletions troubadix/plugins/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pathlib import Path
from typing import Iterable, Iterator

import chardet
import charset_normalizer

from troubadix.plugin import LineContentPlugin, LinterError, LinterResult

Expand All @@ -28,6 +28,8 @@
# Temporarily only check for chars in the range 0x7F-0x9F, like in the old Feed-QA...
CHAR_SET = r"[\x7F-\x9F]"

ALLOWED_ENCODINGS = ["ascii", "latin_1"]


class CheckEncoding(LineContentPlugin):
name = "check_encoding"
Expand All @@ -37,18 +39,14 @@ def check_lines(
nasl_file: Path,
lines: Iterable[str],
) -> Iterator[LinterResult]:
content = nasl_file.read_bytes()
match = charset_normalizer.from_path(
nasl_file, threshold=0.4, cp_isolation=ALLOWED_ENCODINGS
).best()

detection = chardet.detect(content)
encoding = detection.get("encoding")
if encoding and encoding not in [
"ascii",
"latin1",
"ISO-8859-1",
]:
if not match:
yield LinterError(
f"VT uses a wrong encoding. Detected "
f"encoding is {encoding}.",
f"VT uses a wrong encoding. "
f"Allowed encodings are {', '.join(ALLOWED_ENCODINGS)}.",
file=nasl_file,
plugin=self.name,
)
Expand Down

0 comments on commit c53a4e9

Please sign in to comment.