From 8382f9d2e33ddba752a3458f42426048e3274ce0 Mon Sep 17 00:00:00 2001 From: Ben Grande Date: Thu, 14 Mar 2024 12:09:49 +0100 Subject: [PATCH] feat: print hex of unicode Useful for detecting unwanted characters in third-party contribution patches using a CI hook. --- .pre-commit-config.yaml | 11 +++++++++ scripts/unicode-prohibit.sh | 45 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100755 scripts/unicode-prohibit.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 511c8b68..07168576 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,6 +6,14 @@ repos: - repo: local hooks: + + - id: unicode-prohibit + name: unicode-prohibit + entry: scripts/unicode-prohibit.sh + language: script + pass_filenames: true + description: Prohibit Unicode + - id: reuse name: reuse entry: reuse @@ -13,18 +21,21 @@ repos: language: python pass_filenames: false description: Lint files to comply with the REUSE Specification + - id: salt-lint name: salt-lint entry: scripts/salt-lint.sh language: script pass_filenames: true description: Lint Salt files + - id: shellcheck name: shellcheck entry: scripts/shell-lint.sh language: script pass_filenames: true description: Lint Shellscripts + - id: gitlint name: gitlint language: python diff --git a/scripts/unicode-prohibit.sh b/scripts/unicode-prohibit.sh new file mode 100755 index 00000000..6dbed6f3 --- /dev/null +++ b/scripts/unicode-prohibit.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +## SPDX-FileCopyrightText: 2024 Benjamin Grande M. S. +## +## SPDX-License-Identifier: AGPL-3.0-or-later +## +## Finds Unicode recursively and prints in hexadecimal format. 
## Report every non-ASCII character found in the scanned files.
##
## Usage: unicode-prohibit.sh [FILE...]
##
## With FILE arguments only those paths are scanned. With no arguments, or
## with an empty first argument, the repository top level is scanned
## recursively. Each finding is written to stdout as "file:line: <hex bytes>".
##
## Exit status: 0 when nothing reportable was found; 1 when at least one
## character was reported, when git is missing, when the repository top
## level cannot be resolved, or when grep itself failed.

set -eu

command -v git >/dev/null ||
  { printf '%s\n' "Missing program: git" >&2; exit 1; }

## Resolve the top level in a separate assignment. A command substitution
## that fails while being used directly as an argument to 'cd' is not caught
## by 'set -e', and 'cd ""' may succeed without changing directory.
repo_root="$(git rev-parse --show-toplevel)" || exit 1
cd "${repo_root}" || exit 1

## An empty or missing first argument selects the full recursive scan.
## Clearing the positional parameters makes "$@" expand to nothing, so
## 'grep -r' falls back to searching the current directory. Keeping the
## paths in "$@" (instead of a flattened string) preserves names that
## contain whitespace or glob characters.
if test -z "${1-}"; then
  set --
fi

## grep exits 0 on a match, 1 on no match and >1 on a real error (for
## example no -P support or an unreadable file). Only status 1 is benign, so
## the status is captured instead of being discarded.
grep_status=0
unicode_match="$(grep -oPrHn --exclude-dir=.git --exclude-dir=LICENSES \
  -e "[^\x00-\x7F]" -- "$@")" || grep_status=$?

match_found=""
## Read the results line by line from a here-document: unlike a 'for' loop
## over an unquoted expansion this does not split on spaces or expand globs,
## and unlike a pipeline it keeps 'match_found' in the current shell.
while IFS= read -r line; do
  ## An empty result set still yields one empty here-document line.
  test -n "${line}" || continue
  line_file="${line%%:*}"
  case "${line_file}" in
    ## NOTE(review): 'git/*' has no leading dot; confirm whether '.git/*'
    ## was intended.
    git/*|LICENSES/*|.reuse/dep5|*.asc) continue;;
  esac
  rest="${line#*:}"
  line_number="${rest%%:*}"
  ## printf '%s' avoids appending a newline, so only the bytes of the
  ## matched character are dumped, one hexadecimal byte per field.
  line_unicode="$(printf '%s' "${rest#*:}" | od -A n -v -t x1)"
  printf '%s\n' "${line_file}:${line_number}:${line_unicode}"
  match_found="1"
done <<EOF
${unicode_match}
EOF

## Fail closed: a scan that could not run to completion must not pass.
if test "${grep_status}" -gt 1; then
  printf '%s\n' "grep exited with status ${grep_status}; scan incomplete" >&2
  exit 1
fi

if test -n "${match_found}"; then
  exit 1
fi

exit 0