checker: check and error for invalid utf8 string literals (#23721)

vlang · Feb 15, 2025 · f7841e1 · f7841e1
1 parent b418230
commit f7841e1
Show file tree

Hide file tree

Showing 8 changed files with 112 additions and 90 deletions.
diff --git a/cmd/tools/vtest-self.v b/cmd/tools/vtest-self.v
@@ -50,7 +50,7 @@ const essential_list = [
 	'vlib/crypto/md5/md5_test.v',
 	'vlib/dl/dl_test.v',
 	'vlib/encoding/base64/base64_test.v',
-	'vlib/encoding/utf8/encoding_utf8_test.v',
+	'vlib/encoding/utf8/validate/encoding_utf8_test.v',
 	'vlib/encoding/utf8/utf8_util_test.v',
 	'vlib/flag/flag_test.v',
 	'vlib/json/tests/json_decode_test.v',

diff --git a/vlib/encoding/utf8/encoding_utf8_test.v b/vlib/encoding/utf8/encoding_utf8_test.v
diff --git a/vlib/encoding/utf8/utf8.v b/vlib/encoding/utf8/utf8.v
@@ -1,90 +1,13 @@
 module utf8
 
-struct Utf8State {
-mut:
-	index    int
-	subindex int
-	failed   bool
-}
+import encoding.utf8.validate as impl
 
 // validate_str reports if str consists of valid UTF-8 runes
 pub fn validate_str(str string) bool {
-	return validate(str.str, str.len)
+	return impl.utf8_string(str)
 }
 
 // validate reports if data consists of valid UTF-8 runes
 pub fn validate(data &u8, len int) bool {
-	mut state := Utf8State{}
-	for i := 0; i < len; i++ {
-		s := unsafe { data[i] }
-		if s == 0 {
-			break
-		}
-		state.next_state(s)
-		if state.failed {
-			return false
-		}
-	}
-	return !state.failed && state.subindex <= 0
-}
-
-fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
-	if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) {
-		if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) {
-			s.subindex++
-			return true
-		}
-	} else {
-		s.failed = true
-		if is_tail {
-			s.index = 0
-			s.subindex = 0
-			s.failed = false
-		}
-		return true
-	}
-	s.index++
-	s.subindex = 0
-	return false
-}
-
-fn (mut s Utf8State) next_state(c u8) {
-	// sequence 1
-	if s.index == 0 {
-		if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 {
-			return
-		}
-		s.index++
-		s.subindex = 0
-	}
-	is_tail := c >= 0x80 && c <= 0xBF
-	// sequence 2
-	if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
-		return
-	}
-	// sequence 3
-	if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
-		return
-	}
-	if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) {
-		return
-	}
-	// sequence 4
-	if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) {
-		return
-	}
-	if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
-		return
-	}
-	// we should never reach here
-	s.failed = true
+	return impl.utf8_data(data, len)
 }
diff --git a/vlib/encoding/utf8/validate/encoding_utf8_test.v b/vlib/encoding/utf8/validate/encoding_utf8_test.v
@@ -0,0 +1,9 @@
+import encoding.utf8.validate
+
+fn test_validate_str() { // exercises validate.utf8_string with valid and invalid byte sequences
+	assert validate.utf8_string('añçá') == true // literal mixing ASCII and non-ASCII letters
+	assert validate.utf8_string('\x61\xC3\xB1\xC3\xA7\xC3\xA1') == true // the same text, spelled as escaped bytes
+	assert validate.utf8_string('\xC0\xC1') == false // 0xC0 and 0xC1 are not accepted as lead bytes
+	assert validate.utf8_string('\xF5\xFF') == false // bytes above 0xF4 are not accepted as lead bytes
+	assert validate.utf8_string('\xE0\xEF') == false // 0xE0 has to be followed by a byte in 0xA0..0xBF
+}
diff --git a/vlib/encoding/utf8/validate/validate_utf8.v b/vlib/encoding/utf8/validate/validate_utf8.v
@@ -0,0 +1,90 @@
+module validate
+
+struct Utf8State { // validator state, advanced one byte at a time by next_state
+mut:
+	index    int // sequence class being matched: 0 = between runes (single bytes), 1 = 2-byte, 2..5 = 3-byte, 6..8 = 4-byte
+	subindex int // number of bytes of the current multi-byte rune, that have been accepted so far
+	failed   bool // set when a byte can neither start, nor continue an accepted sequence
+}
+
+// utf8_string returns true, if the given string `s` consists only of valid UTF-8 runes. It checks the `s.len` bytes at `s.str` with utf8_data.
+pub fn utf8_string(s string) bool {
+	return utf8_data(s.str, s.len)
+}
+
+// utf8_data returns true, if the given `data` block, with length `len` bytes, consists only of valid UTF-8 runes. Note: the scan stops at the first 0 byte, so the bytes after it are not checked.
+pub fn utf8_data(data &u8, len int) bool {
+	mut state := Utf8State{}
+	for i := 0; i < len; i++ {
+		s := unsafe { data[i] } // raw pointer indexing, bounded only by the `len` passed by the caller
+		if s == 0 {
+			break // a 0 byte ends the scan early
+		}
+		state.next_state(s)
+		if state.failed {
+			return false
+		}
+	}
+	return !state.failed && state.subindex <= 0 // subindex > 0 here means, that the input ended in the middle of a rune
+}
+
+// seq matches the current byte against the sequence class `s.index`, given its roles:
+// `r0` - the byte is a valid lead byte for this class,
+// `r1` - the byte is a valid second byte for this class,
+// `is_tail` - the byte is a generic continuation byte, as computed by the caller.
+// It returns true, when the byte was consumed (accepted, or rejected with `s.failed` set).
+// It returns false, when the byte is not a lead byte of this class; in that case `s.index`
+// is advanced, so that the caller can try the next class.
+fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
+	if s.subindex == 0 || (s.index > 1 && s.subindex == 1) || (s.index >= 6 && s.subindex == 2) {
+		if (s.subindex == 0 && r0) || (s.subindex == 1 && r1) || (s.subindex == 2 && is_tail) {
+			s.subindex++
+			return true
+		}
+		if s.subindex > 0 {
+			// A lead byte was already accepted, but this byte can not continue its rune.
+			// Fail here, instead of retrying the byte as the lead byte of a later class,
+			// otherwise truncated runes like the `\xE0` in `\xE0\xEF\xBF\xBF` are silently skipped.
+			s.failed = true
+			return true
+		}
+	} else {
+		// the last byte of the rune is expected here, and it has to be a continuation byte
+		s.failed = true
+		if is_tail {
+			// the rune is complete => start over with the next byte
+			s.index = 0
+			s.subindex = 0
+			s.failed = false
+		}
+		return true
+	}
+	s.index++
+	s.subindex = 0
+	return false
+}
+
+fn (mut s Utf8State) next_state(c u8) { // feeds one byte `c` into the validator; sets s.failed on invalid input
+	// sequence 1: single bytes 0x00..0x7F, accepted only when no multi-byte rune is in progress
+	if s.index == 0 {
+		if (c >= 0x00 + 1 && c <= 0x7F) || c == 0x00 {
+			return
+		}
+		s.index++
+		s.subindex = 0
+	}
+	is_tail := c >= 0x80 && c <= 0xBF // generic continuation byte
+	// sequence 2: lead byte 0xC2..0xDF, followed by one continuation byte
+	if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
+		return
+	}
+	// sequence 3: lead bytes 0xE0..0xEF; the classes 2..5 differ in the range allowed for the second byte
+	if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
+		return
+	}
+	if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, c >= 0x80 && c <= 0xBF, is_tail) {
+		return
+	}
+	if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
+		return
+	}
+	if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, c >= 0x80 && c <= 0xBF, is_tail) {
+		return
+	}
+	// sequence 4: lead bytes 0xF0..0xF4; the classes 6..8 differ in the range allowed for the second byte
+	if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
+		return
+	}
+	if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, c >= 0x80 && c <= 0xBF, is_tail) {
+		return
+	}
+	if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
+		return
+	}
+	// no class accepted the byte (for example 0xC0, 0xC1 or 0xF5..0xFF) => the input is invalid
+	s.failed = true
+}
diff --git a/vlib/v/checker/str.v b/vlib/v/checker/str.v
@@ -5,6 +5,7 @@ module checker
 
 import v.ast
 import v.token
+import encoding.utf8.validate
 
 fn (mut c Checker) get_default_fmt(ftyp ast.Type, typ ast.Type) u8 {
 	if ftyp.has_option_or_result() {
@@ -131,6 +132,10 @@ const unicode_lit_overflow_message = 'unicode character exceeds max allowed valu
 // https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff
 @[direct_array_access]
 fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type {
+	valid_utf8 := validate.utf8_string(node.val)
+	if !valid_utf8 {
+		c.warn("invalid utf8 string, please check your file's encoding is utf8", node.pos)
+	}
 	mut idx := 0
 	for idx < node.val.len {
 		match node.val[idx] {

diff --git a/vlib/v/checker/tests/invalid_utf8_string.out b/vlib/v/checker/tests/invalid_utf8_string.out
@@ -0,0 +1,3 @@
+vlib/v/checker/tests/invalid_utf8_string.vv:1:6: warning: invalid utf8 string, please check your file's encoding is utf8
+    1 | _ := '����txt'
+      |      ~~~~~~~
diff --git a/vlib/v/checker/tests/invalid_utf8_string.vv b/vlib/v/checker/tests/invalid_utf8_string.vv
@@ -0,0 +1 @@
+_ := '����txt'