Skip to content

Commit

Permalink
checker: check and error for invalid utf8 string literals (#23721)
Browse files Browse the repository at this point in the history
  • Loading branch information
kbkpbot authored Feb 15, 2025
1 parent b418230 commit f7841e1
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 90 deletions.
2 changes: 1 addition & 1 deletion cmd/tools/vtest-self.v
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ const essential_list = [
'vlib/crypto/md5/md5_test.v',
'vlib/dl/dl_test.v',
'vlib/encoding/base64/base64_test.v',
'vlib/encoding/utf8/encoding_utf8_test.v',
'vlib/encoding/utf8/validate/encoding_utf8_test.v',
'vlib/encoding/utf8/utf8_util_test.v',
'vlib/flag/flag_test.v',
'vlib/json/tests/json_decode_test.v',
Expand Down
9 changes: 0 additions & 9 deletions vlib/encoding/utf8/encoding_utf8_test.v

This file was deleted.

83 changes: 3 additions & 80 deletions vlib/encoding/utf8/utf8.v
Original file line number Diff line number Diff line change
@@ -1,90 +1,13 @@
module utf8

struct Utf8State {
mut:
index int
subindex int
failed bool
}
import encoding.utf8.validate as impl

// validate_str reports if str consists of valid UTF-8 runes
pub fn validate_str(str string) bool {
return validate(str.str, str.len)
return impl.utf8_string(str)
}

// validate reports if data consists of valid UTF-8 runes
pub fn validate(data &u8, len int) bool {
mut state := Utf8State{}
for i := 0; i < len; i++ {
s := unsafe { data[i] }
if s == 0 {
break
}
state.next_state(s)
if state.failed {
return false
}
}
return !state.failed && state.subindex <= 0
}

fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) {
if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) {
s.subindex++
return true
}
} else {
s.failed = true
if is_tail {
s.index = 0
s.subindex = 0
s.failed = false
}
return true
}
s.index++
s.subindex = 0
return false
}

fn (mut s Utf8State) next_state(c u8) {
// sequence 1
if s.index == 0 {
if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 {
return
}
s.index++
s.subindex = 0
}
is_tail := c >= 0x80 && c <= 0xBF
// sequence 2
if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
return
}
// sequence 3
if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
return
}
if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) {
return
}
if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
return
}
if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) {
return
}
// sequence 4
if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
return
}
if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) {
return
}
if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
return
}
// we should never reach here
s.failed = true
return impl.utf8_data(data, len)
}
9 changes: 9 additions & 0 deletions vlib/encoding/utf8/validate/encoding_utf8_test.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import encoding.utf8.validate

// test_validate_str checks validate.utf8_string against a few well-formed
// and a few malformed byte sequences.
fn test_validate_str() {
	// The same text, once as literal characters and once as escaped bytes.
	assert validate.utf8_string('añçá')
	assert validate.utf8_string('\x61\xC3\xB1\xC3\xA7\xC3\xA1')
	// 0xC0 and 0xC1 are never valid lead bytes.
	assert !validate.utf8_string('\xC0\xC1')
	// 0xF5 and 0xFF are outside every lead-byte range.
	assert !validate.utf8_string('\xF5\xFF')
	// 0xE0 must be followed by a byte in 0xA0..0xBF.
	assert !validate.utf8_string('\xE0\xEF')
}
90 changes: 90 additions & 0 deletions vlib/encoding/utf8/validate/validate_utf8.v
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
module validate

// Utf8State is the scanner state shared by `next_state` and `seq` while a
// byte buffer is being validated.
struct Utf8State {
mut:
// index selects the lead-byte class currently being matched:
// 0 = no multi-byte sequence in progress, 1 = 2-byte sequences,
// 2..5 = the 3-byte classes, 6..8 = the 4-byte classes.
index int
// subindex counts the bytes of the current sequence accepted so far;
// it is reset to 0 when a sequence completes.
subindex int
// failed is set once a byte cannot be accepted at its position.
failed bool
}

// utf8_string reports whether the string `s` is made up of valid UTF-8 runes only.
// It validates the string's byte buffer via `utf8_data`.
pub fn utf8_string(s string) bool {
	is_valid := utf8_data(s.str, s.len)
	return is_valid
}

// utf8_data reports whether the `len` bytes starting at `data` consist only of valid UTF-8 runes.
// Scanning stops at the first zero byte; bytes after it are not examined.
// A multi-byte sequence that is still incomplete when scanning ends makes the result false.
pub fn utf8_data(data &u8, len int) bool {
	mut st := Utf8State{}
	for pos in 0 .. len {
		b := unsafe { data[pos] }
		if b == 0 {
			// treat a zero byte as the end of the data
			break
		}
		st.next_state(b)
		if st.failed {
			return false
		}
	}
	if st.failed {
		return false
	}
	// a positive subindex means a sequence was started but never finished
	return st.subindex <= 0
}

// seq tries to consume the current input byte as part of the lead-byte class
// selected by `s.index`. The caller passes three facts about that byte:
//   `r0`      - it is a valid first (lead) byte for this class
//   `r1`      - it is a valid second byte for this class
//   `is_tail` - it is a generic continuation byte (0x80..0xBF)
// It returns true when the byte has been handled, i.e. either accepted or
// rejected with `s.failed` set. It returns false only when no sequence is in
// progress and the byte is not a lead byte of this class; `s.index` has then
// been advanced, so that the caller can probe the next class.
fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
	if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) {
		// Positions before the final byte of the sequence.
		if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) {
			s.subindex++
			return true
		}
		if s.subindex > 0 {
			// A lead byte was already accepted and this byte is not a valid
			// continuation for it. Fail here, instead of letting the caller
			// retry the byte as the lead byte of a later class, which would
			// silently drop the incomplete sequence (for example
			// `\xE0\xE1\x80\x80` used to be reported as valid).
			s.failed = true
			return true
		}
	} else {
		// Final byte of the sequence: any continuation byte completes it and
		// resets the state, everything else is an error.
		s.failed = !is_tail
		if is_tail {
			s.index = 0
			s.subindex = 0
		}
		return true
	}
	// Not a lead byte of this class: move on to the next one.
	s.index++
	s.subindex = 0
	return false
}

// next_state feeds one input byte `c` into the state machine.
// While `s.index` is 0, bytes up to 0x7F are accepted directly. Any other byte
// is offered to the lead-byte classes 1..8 in order, through `seq`, whose
// arguments are: "`c` is a valid lead byte of the class", "`c` is a valid
// second byte of the class" and "`c` is a continuation byte".
// The order of the checks matters: a `seq` call that returns false has already
// advanced `s.index`, which enables the next check in the chain.
fn (mut s Utf8State) next_state(c u8) {
// sequence 1: a single byte in 0x00..0x7F, when no multi-byte sequence is in progress
if s.index == 0 {
if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 {
return
}
// not a single byte rune: start probing the multi-byte classes with class 1
s.index++
s.subindex = 0
}
// generic continuation byte
is_tail := c >= 0x80 && c <= 0xBF
// sequence 2: lead 0xC2..0xDF followed by one continuation byte
if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
return
}
// sequence 3: lead 0xE0, the second byte is restricted to 0xA0..0xBF
if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
return
}
// sequence 3: lead 0xE1..0xEC, any continuation byte may follow
if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) {
return
}
// sequence 3: lead 0xED, the second byte is restricted to 0x80..0x9F
if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
return
}
// sequence 3: lead 0xEE..0xEF, any continuation byte may follow
if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) {
return
}
// sequence 4: lead 0xF0, the second byte is restricted to 0x90..0xBF
if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
return
}
// sequence 4: lead 0xF1..0xF3, any continuation byte may follow
if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) {
return
}
// sequence 4: lead 0xF4, the second byte is restricted to 0x80..0x8F
if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
return
}
// reached when none of the remaining classes accepted `c`:
// the byte is invalid at this position
s.failed = true
}
5 changes: 5 additions & 0 deletions vlib/v/checker/str.v
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module checker

import v.ast
import v.token
import encoding.utf8.validate

fn (mut c Checker) get_default_fmt(ftyp ast.Type, typ ast.Type) u8 {
if ftyp.has_option_or_result() {
Expand Down Expand Up @@ -131,6 +132,10 @@ const unicode_lit_overflow_message = 'unicode character exceeds max allowed valu
// https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
@[direct_array_access]
fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
valid_utf8 := validate.utf8_string(node.val)
if !valid_utf8 {
c.warn("invalid utf8 string, please check your file's encoding is utf8", node.pos)
}
mut idx := 0
for idx < node.val.len {
match node.val[idx] {
Expand Down
3 changes: 3 additions & 0 deletions vlib/v/checker/tests/invalid_utf8_string.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
vlib/v/checker/tests/invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8
1 | _ := '����txt'
| ~~~~~~~
1 change: 1 addition & 0 deletions vlib/v/checker/tests/invalid_utf8_string.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
_ := '����txt'

0 comments on commit f7841e1

Please sign in to comment.