Skip to content

Commit 8a69862

Browse files
Mottl authored and friendlymatthew committed
fix: Data type inference for NaN, inf and -inf in csv files (apache#7150)
* fix: Data type inference for NaN, inf and -inf in csv files
* Adds tests for NaN, inf and -inf Float64 values
* Adds python-style NaN
1 parent e50bcbc commit 8a69862

File tree

2 files changed

+12
-3
lines changed

2 files changed

+12
-3
lines changed

arrow-csv/src/reader/mod.rs

+8 -2
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,8 @@ impl InferredDataType {
221221
} else {
222222
1 << m
223223
}
224+
} else if string == "NaN" || string == "nan" || string == "inf" || string == "-inf" {
225+
1 << 2 // Float64
224226
} else {
225227
1 << 8 // Utf8
226228
}
@@ -1659,7 +1661,7 @@ mod tests {
16591661
let mut csv = builder.build(file).unwrap();
16601662
let batch = csv.next().unwrap().unwrap();
16611663

1662-
assert_eq!(7, batch.num_rows());
1664+
assert_eq!(10, batch.num_rows());
16631665
assert_eq!(6, batch.num_columns());
16641666

16651667
let schema = batch.schema();
@@ -1803,6 +1805,10 @@ mod tests {
18031805
assert_eq!(infer_field_schema("10.2"), DataType::Float64);
18041806
assert_eq!(infer_field_schema(".2"), DataType::Float64);
18051807
assert_eq!(infer_field_schema("2."), DataType::Float64);
1808+
assert_eq!(infer_field_schema("NaN"), DataType::Float64);
1809+
assert_eq!(infer_field_schema("nan"), DataType::Float64);
1810+
assert_eq!(infer_field_schema("inf"), DataType::Float64);
1811+
assert_eq!(infer_field_schema("-inf"), DataType::Float64);
18061812
assert_eq!(infer_field_schema("true"), DataType::Boolean);
18071813
assert_eq!(infer_field_schema("trUe"), DataType::Boolean);
18081814
assert_eq!(infer_field_schema("false"), DataType::Boolean);
@@ -2372,7 +2378,7 @@ mod tests {
23722378
fn test_buffered() {
23732379
let tests = [
23742380
("test/data/uk_cities.csv", false, 37),
2375-
("test/data/various_types.csv", true, 7),
2381+
("test/data/various_types.csv", true, 10),
23762382
("test/data/decimal_test.csv", false, 10),
23772383
];
23782384

arrow-csv/test/data/various_types.csv

+4 -1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,7 @@ c_int|c_float|c_string|c_bool|c_date|c_datetime
55
4|4.4||false||
66
5|6.6|""|false|1990-01-01|1990-01-01T03:00:00
77
4|4e6||false||
8-
4|4.0e-6||false||
8+
4|4.0e-6||false||
9+
6|NaN||false||
10+
7|inf||false||
11+
8|-inf||false||

0 commit comments

Comments (0)