From a99cb8b1ffea1d2cee59fcc72eb850426ec357aa Mon Sep 17 00:00:00 2001 From: kbkpbot Date: Fri, 14 Feb 2025 14:37:38 +0800 Subject: [PATCH 1/5] checker: check for invalid utf8 string --- vlib/v/checker/str.v | 5 +++++ vlib/v/tests/skip_unused/invalid_utf8_string.run.out | 4 ++++ .../skip_unused/invalid_utf8_string.skip_unused.run.out | 4 ++++ vlib/v/tests/skip_unused/invalid_utf8_string.vv | 2 ++ 4 files changed, 15 insertions(+) create mode 100644 vlib/v/tests/skip_unused/invalid_utf8_string.run.out create mode 100644 vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out create mode 100644 vlib/v/tests/skip_unused/invalid_utf8_string.vv diff --git a/vlib/v/checker/str.v b/vlib/v/checker/str.v index 1d84f234d65307..b6d43d8916edcb 100644 --- a/vlib/v/checker/str.v +++ b/vlib/v/checker/str.v @@ -5,6 +5,7 @@ module checker import v.ast import v.token +import encoding.utf8 fn (mut c Checker) get_default_fmt(ftyp ast.Type, typ ast.Type) u8 { if ftyp.has_option_or_result() { @@ -131,6 +132,10 @@ const unicode_lit_overflow_message = 'unicode character exceeds max allowed valu // https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff @[direct_array_access] fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type { + valid_utf8 := utf8.validate(node.val.str, node.val.len) + if valid_utf8 == false { + c.warn("invalid utf8 string, please check your file's encoding is utf8", node.pos) + } mut idx := 0 for idx < node.val.len { match node.val[idx] { diff --git a/vlib/v/tests/skip_unused/invalid_utf8_string.run.out b/vlib/v/tests/skip_unused/invalid_utf8_string.run.out new file mode 100644 index 00000000000000..8cccc52f9e027b --- /dev/null +++ b/vlib/v/tests/skip_unused/invalid_utf8_string.run.out @@ -0,0 +1,4 @@ +invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8 + 1 | x := '²âÊÔtxt' + | ~~~~~~~ + 2 | println(x) diff --git a/vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out b/vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out new file mode 100644 index 00000000000000..8cccc52f9e027b --- /dev/null +++ b/vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out @@ -0,0 +1,4 @@ +invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8 + 1 | x := '²âÊÔtxt' + | ~~~~~~~ + 2 | println(x) diff --git a/vlib/v/tests/skip_unused/invalid_utf8_string.vv b/vlib/v/tests/skip_unused/invalid_utf8_string.vv new file mode 100644 index 00000000000000..0099cc3abf7fdd --- /dev/null +++ b/vlib/v/tests/skip_unused/invalid_utf8_string.vv @@ -0,0 +1,2 @@ +x := '²âÊÔtxt' +println(x) From 1b1f2ea86dfdb5011a20ac3f8bb952b9c730bf49 Mon Sep 17 00:00:00 2001 From: kbkpbot Date: Fri, 14 Feb 2025 15:08:19 +0800 Subject: [PATCH 2/5] move test --- vlib/v/checker/tests/invalid_utf8_string.out | 3 +++ vlib/v/checker/tests/invalid_utf8_string.vv | 1 + vlib/v/tests/skip_unused/invalid_utf8_string.run.out | 4 ---- .../tests/skip_unused/invalid_utf8_string.skip_unused.run.out | 4 ---- vlib/v/tests/skip_unused/invalid_utf8_string.vv | 2 -- 5 files changed, 4 insertions(+), 10 deletions(-) create mode 100644 vlib/v/checker/tests/invalid_utf8_string.out create mode 100644 vlib/v/checker/tests/invalid_utf8_string.vv delete mode 100644 vlib/v/tests/skip_unused/invalid_utf8_string.run.out delete mode 100644 vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out delete mode 100644 vlib/v/tests/skip_unused/invalid_utf8_string.vv diff --git a/vlib/v/checker/tests/invalid_utf8_string.out b/vlib/v/checker/tests/invalid_utf8_string.out new file mode 100644 index 00000000000000..c67a9e8ae19b71 --- /dev/null +++ b/vlib/v/checker/tests/invalid_utf8_string.out @@ -0,0 +1,3 @@ +vlib/v/checker/tests/invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8 + 1 | _ := '²âÊÔtxt' + | ~~~~~~~ diff --git a/vlib/v/checker/tests/invalid_utf8_string.vv b/vlib/v/checker/tests/invalid_utf8_string.vv new file mode 100644 index 00000000000000..e89dd4bcfe0d46 --- /dev/null +++ b/vlib/v/checker/tests/invalid_utf8_string.vv @@ -0,0 +1 @@ +_ := '²âÊÔtxt' diff --git a/vlib/v/tests/skip_unused/invalid_utf8_string.run.out b/vlib/v/tests/skip_unused/invalid_utf8_string.run.out deleted file mode 100644 index 8cccc52f9e027b..00000000000000 --- a/vlib/v/tests/skip_unused/invalid_utf8_string.run.out +++ /dev/null @@ -1,4 +0,0 @@ -invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8 - 1 | x := '²âÊÔtxt' - | ~~~~~~~ - 2 | println(x) diff --git a/vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out b/vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out deleted file mode 100644 index 8cccc52f9e027b..00000000000000 --- a/vlib/v/tests/skip_unused/invalid_utf8_string.skip_unused.run.out +++ /dev/null @@ -1,4 +0,0 @@ -invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8 - 1 | x := '²âÊÔtxt' - | ~~~~~~~ - 2 | println(x) diff --git a/vlib/v/tests/skip_unused/invalid_utf8_string.vv b/vlib/v/tests/skip_unused/invalid_utf8_string.vv deleted file mode 100644 index 0099cc3abf7fdd..00000000000000 --- a/vlib/v/tests/skip_unused/invalid_utf8_string.vv +++ /dev/null @@ -1,2 +0,0 @@ -x := '²âÊÔtxt' -println(x) From 1aee6adbdb34cf9f678092ff88c155b355a3a7a0 Mon Sep 17 00:00:00 2001 From: kbkpbot Date: Fri, 14 Feb 2025 16:54:53 +0800 Subject: [PATCH 3/5] Update vlib/v/checker/str.v Co-authored-by: Swastik Baranwal --- vlib/v/checker/str.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vlib/v/checker/str.v b/vlib/v/checker/str.v index b6d43d8916edcb..ac05680e693da6 100644 --- a/vlib/v/checker/str.v +++ b/vlib/v/checker/str.v @@ -133,7 +133,7 @@ const unicode_lit_overflow_message = 'unicode character exceeds max allowed valu @[direct_array_access] fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type { valid_utf8 := utf8.validate(node.val.str, node.val.len) - if valid_utf8 == false { + if !valid_utf8 { c.warn("invalid utf8 string, please check your file's encoding is utf8", node.pos) } mut idx := 0 From dfc9a4fa0eb76a557b34c9cae14ff75671a77482 Mon Sep 17 00:00:00 2001 From: Delyan Angelov Date: Sat, 15 Feb 2025 16:39:38 +0200 Subject: [PATCH 4/5] extract `vlib/encoding/utf8/validate`, use it directly to avoid importing the whole of `encoding.utf8`, which has a lot of unrelated tables etc --- vlib/encoding/utf8/encoding_utf8_test.v | 9 -- vlib/encoding/utf8/utf8.v | 83 +---------------- .../utf8/validate/encoding_utf8_test.v | 9 ++ vlib/encoding/utf8/validate/validate_utf8.v | 90 +++++++++++++++++++ vlib/v/checker/str.v | 4 +- 5 files changed, 104 insertions(+), 91 deletions(-) delete mode 100644 vlib/encoding/utf8/encoding_utf8_test.v create mode 100644 vlib/encoding/utf8/validate/encoding_utf8_test.v create mode 100644 vlib/encoding/utf8/validate/validate_utf8.v diff --git a/vlib/encoding/utf8/encoding_utf8_test.v b/vlib/encoding/utf8/encoding_utf8_test.v deleted file mode 100644 index ebea87ca22b739..00000000000000 --- a/vlib/encoding/utf8/encoding_utf8_test.v +++ /dev/null @@ -1,9 +0,0 @@ -import encoding.utf8 - -fn test_validate_str() { - assert utf8.validate_str('añçá') == true - assert utf8.validate_str('\x61\xC3\xB1\xC3\xA7\xC3\xA1') == true - assert utf8.validate_str('\xC0\xC1') == false - assert utf8.validate_str('\xF5\xFF') == false - assert utf8.validate_str('\xE0\xEF') == false -} diff --git a/vlib/encoding/utf8/utf8.v b/vlib/encoding/utf8/utf8.v index b7f8195310f769..b551b78ae2a6ea 100644 --- a/vlib/encoding/utf8/utf8.v +++ b/vlib/encoding/utf8/utf8.v @@ -1,90 +1,13 @@ module utf8 -struct Utf8State { -mut: - index int - subindex int - failed bool -} +import encoding.utf8.validate as impl // validate_str reports if str consists of valid UTF-8 runes pub fn validate_str(str string) bool { - return validate(str.str, str.len) + return impl.utf8_string(str) } // validate reports if data consists of valid UTF-8 runes pub fn validate(data &u8, len int) bool { - mut state := Utf8State{} - for i := 0; i < len; i++ { - s := unsafe { data[i] } - if s == 0 { - break - } - state.next_state(s) - if state.failed { - return false - } - } - return !state.failed && state.subindex <= 0 -} - -fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool { - if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) { - if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) { - s.subindex++ - return true - } - } else { - s.failed = true - if is_tail { - s.index = 0 - s.subindex = 0 - s.failed = false - } - return true - } - s.index++ - s.subindex = 0 - return false -} - -fn (mut s Utf8State) next_state(c u8) { - // sequence 1 - if s.index == 0 { - if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 { - return - } - s.index++ - s.subindex = 0 - } - is_tail := c >= 0x80 && c <= 0xBF - // sequence 2 - if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) { - return - } - // sequence 3 - if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) { - return - } - if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) { - return - } - if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) { - return - } - if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) { - return - } - // sequence 4 - if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) { - return - } - if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) { - return - } - if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) { - return - } - // we should never reach here - s.failed = true + return impl.utf8_data(data, len) } diff --git a/vlib/encoding/utf8/validate/encoding_utf8_test.v b/vlib/encoding/utf8/validate/encoding_utf8_test.v new file mode 100644 index 00000000000000..fe085a5d1f5c2d --- /dev/null +++ b/vlib/encoding/utf8/validate/encoding_utf8_test.v @@ -0,0 +1,9 @@ +import encoding.utf8.validate + +fn test_validate_str() { + assert validate.utf8_string('añçá') == true + assert validate.utf8_string('\x61\xC3\xB1\xC3\xA7\xC3\xA1') == true + assert validate.utf8_string('\xC0\xC1') == false + assert validate.utf8_string('\xF5\xFF') == false + assert validate.utf8_string('\xE0\xEF') == false +} diff --git a/vlib/encoding/utf8/validate/validate_utf8.v b/vlib/encoding/utf8/validate/validate_utf8.v new file mode 100644 index 00000000000000..44e42ea6eca61c --- /dev/null +++ b/vlib/encoding/utf8/validate/validate_utf8.v @@ -0,0 +1,90 @@ +module validate + +struct Utf8State { +mut: + index int + subindex int + failed bool +} + +// utf8_string returns true, if the given string `s` consists only of valid UTF-8 runes +pub fn utf8_string(s string) bool { + return utf8_data(s.str, s.len) +} + +// utf8_data returns true, if the given `data` block, with length `len` bytes, consists only of valid UTF-8 runes +pub fn utf8_data(data &u8, len int) bool { + mut state := Utf8State{} + for i := 0; i < len; i++ { + s := unsafe { data[i] } + if s == 0 { + break + } + state.next_state(s) + if state.failed { + return false + } + } + return !state.failed && state.subindex <= 0 +} + +fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool { + if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) { + if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) { + s.subindex++ + return true + } + } else { + s.failed = true + if is_tail { + s.index = 0 + s.subindex = 0 + s.failed = false + } + return true + } + s.index++ + s.subindex = 0 + return false +} + +fn (mut s Utf8State) next_state(c u8) { + // sequence 1 + if s.index == 0 { + if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 { + return + } + s.index++ + s.subindex = 0 + } + is_tail := c >= 0x80 && c <= 0xBF + // sequence 2 + if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) { + return + } + // sequence 3 + if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) { + return + } + if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) { + return + } + if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) { + return + } + if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) { + return + } + // sequence 4 + if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) { + return + } + if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) { + return + } + if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) { + return + } + // we should never reach here + s.failed = true +} diff --git a/vlib/v/checker/str.v b/vlib/v/checker/str.v index ac05680e693da6..9209ac7d2c2d09 100644 --- a/vlib/v/checker/str.v +++ b/vlib/v/checker/str.v @@ -5,7 +5,7 @@ module checker import v.ast import v.token -import encoding.utf8 +import encoding.utf8.validate fn (mut c Checker) get_default_fmt(ftyp ast.Type, typ ast.Type) u8 { if ftyp.has_option_or_result() { @@ -132,7 +132,7 @@ const unicode_lit_overflow_message = 'unicode character exceeds max allowed valu // https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff @[direct_array_access] fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type { - valid_utf8 := utf8.validate(node.val.str, node.val.len) + valid_utf8 := validate.utf8_string(node.val) if !valid_utf8 { c.warn("invalid utf8 string, please check your file's encoding is utf8", node.pos) } From 020ae2c2a88411e08b76f1144a051cf0e8cdf30a Mon Sep 17 00:00:00 2001 From: Delyan Angelov Date: Sat, 15 Feb 2025 19:29:35 +0200 Subject: [PATCH 5/5] update the reference to the moved test in cmd/tools/vtest-self.v too --- cmd/tools/vtest-self.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/tools/vtest-self.v b/cmd/tools/vtest-self.v index cf9f16e3d8277b..504a1e6bc3046a 100644 --- a/cmd/tools/vtest-self.v +++ b/cmd/tools/vtest-self.v @@ -50,7 +50,7 @@ const essential_list = [ 'vlib/crypto/md5/md5_test.v', 'vlib/dl/dl_test.v', 'vlib/encoding/base64/base64_test.v', - 'vlib/encoding/utf8/encoding_utf8_test.v', + 'vlib/encoding/utf8/validate/encoding_utf8_test.v', 'vlib/encoding/utf8/utf8_util_test.v', 'vlib/flag/flag_test.v', 'vlib/json/tests/json_decode_test.v',