Skip to content

Commit

Permalink
Parse null bitmaps (#54)
Browse files Browse the repository at this point in the history
* Parse null bitmaps

* Add test with null bitmap
kylebarron authored Oct 31, 2023
1 parent 6abe5fc commit 428ce94
Showing 5 changed files with 194 additions and 21 deletions.
135 changes: 115 additions & 20 deletions src/vector.ts
Original file line number Diff line number Diff line change
@@ -64,7 +64,12 @@ export function parseData<T extends DataType>(

if (DataType.isInt(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);
const byteLength = (length * dataType.bitWidth) / 8;
const data = copy
? new dataType.ArrayType(copyBuffer(dataView.buffer, dataPtr, byteLength))
@@ -81,7 +86,12 @@ export function parseData<T extends DataType>(

if (DataType.isFloat(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);
// bitwidth doesn't exist on float types I guess
const byteLength = length * dataType.ArrayType.BYTES_PER_ELEMENT;
const data = copy
@@ -99,7 +109,12 @@ export function parseData<T extends DataType>(

if (DataType.isBool(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

// Boolean arrays are bit-packed. This means the byte length should be the number of elements,
// rounded up to the nearest byte to account for the remainder.
@@ -120,7 +135,12 @@ export function parseData<T extends DataType>(

if (DataType.isDecimal(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);
const byteLength = (length * dataType.bitWidth) / 8;
const data = copy
? new dataType.ArrayType(copyBuffer(dataView.buffer, dataPtr, byteLength))
@@ -137,7 +157,12 @@ export function parseData<T extends DataType>(

if (DataType.isDate(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

let byteWidth = getDateByteWidth(dataType);
const data = copy
@@ -157,7 +182,12 @@ export function parseData<T extends DataType>(

if (DataType.isTime(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);
const byteLength = (length * dataType.bitWidth) / 8;
const data = copy
? new dataType.ArrayType(copyBuffer(dataView.buffer, dataPtr, byteLength))
@@ -174,7 +204,12 @@ export function parseData<T extends DataType>(

if (DataType.isTimestamp(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

let byteWidth = getTimeByteWidth(dataType);
const data = copy
@@ -194,7 +229,12 @@ export function parseData<T extends DataType>(

if (DataType.isInterval(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

// What's the bitwidth here?
if (copy) {
@@ -215,7 +255,12 @@ export function parseData<T extends DataType>(

if (DataType.isBinary(dataType)) {
const [validityPtr, offsetsPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

const valueOffsets = copy
? new Int32Array(
@@ -247,7 +292,12 @@ export function parseData<T extends DataType>(

if (isLargeBinary(dataType)) {
const [validityPtr, offsetsPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

// The original value offsets are an Int64Array, which Arrow JS does not yet support natively
const originalValueOffsets = new BigInt64Array(
@@ -285,7 +335,12 @@ export function parseData<T extends DataType>(

if (DataType.isUtf8(dataType)) {
const [validityPtr, offsetsPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

const valueOffsets = copy
? new Int32Array(
@@ -317,7 +372,12 @@ export function parseData<T extends DataType>(

if (isLargeUtf8(dataType)) {
const [validityPtr, offsetsPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

// The original value offsets are an Int64Array, which Arrow JS does not yet support natively
const originalValueOffsets = new BigInt64Array(
@@ -355,7 +415,12 @@ export function parseData<T extends DataType>(

if (DataType.isFixedSizeBinary(dataType)) {
const [validityPtr, dataPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);
const data = copy
? new dataType.ArrayType(
copyBuffer(dataView.buffer, dataPtr, length * dataType.byteWidth)
@@ -378,7 +443,12 @@ export function parseData<T extends DataType>(
if (DataType.isList(dataType)) {
assert(nChildren === 1);
const [validityPtr, offsetsPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);
const valueOffsets = copy
? new Int32Array(
copyBuffer(
@@ -404,7 +474,12 @@ export function parseData<T extends DataType>(
dataType;
assert(nChildren === 1);
const [validityPtr, offsetsPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

// The original value offsets are an Int64Array, which Arrow JS does not yet support natively
const originalValueOffsets = new BigInt64Array(
@@ -435,7 +510,12 @@ export function parseData<T extends DataType>(
if (DataType.isFixedSizeList(dataType)) {
assert(nChildren === 1);
const [validityPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

return arrow.makeData({
type: dataType,
@@ -449,7 +529,12 @@ export function parseData<T extends DataType>(

if (DataType.isStruct(dataType)) {
const [validityPtr] = bufferPtrs;
const nullBitmap = parseNullBitmap(dataView.buffer, validityPtr, copy);
const nullBitmap = parseNullBitmap(
dataView.buffer,
validityPtr,
length,
copy
);

return arrow.makeData({
type: dataType,
@@ -490,11 +575,21 @@ function getTimeByteWidth(type: arrow.Time | arrow.Timestamp): number {
function parseNullBitmap(
buffer: ArrayBuffer,
validityPtr: number,
length: number,
copy: boolean
): NullBitmap {
// TODO: parse validity bitmaps
const nullBitmap = validityPtr === 0 ? null : null;
return nullBitmap;
if (validityPtr === 0) {
return null;
}

// Each value takes up one bit
const byteLength = (length >> 3) + 1;

if (copy) {
return new Uint8Array(copyBuffer(buffer, validityPtr, byteLength));
} else {
return new Uint8Array(buffer, validityPtr, byteLength);
}
}

/** Copy existing buffer into new buffer */
41 changes: 40 additions & 1 deletion tests/ffi.test.ts
Original file line number Diff line number Diff line change
@@ -2,7 +2,12 @@ import { readFileSync } from "fs";
import { describe, it, expect } from "vitest";
import * as arrow from "apache-arrow";
import * as wasm from "rust-arrow-ffi";
import { arrowTableToFFI, arraysEqual, loadIPCTableFromDisk } from "./utils";
import {
arrowTableToFFI,
arraysEqual,
loadIPCTableFromDisk,
validityEqual,
} from "./utils";
import { parseField, parseVector } from "../src";
import { Type } from "../src/types";

@@ -622,3 +627,37 @@ describe("date32", (t) => {
// expect(originalVector.get(i), wasmVector.get(i));
// }
// });

describe("nullable int", (t) => {
function test(copy: boolean) {
let columnIndex = TEST_TABLE.schema.fields.findIndex(
(field) => field.name == "nullable_int"
);

const originalField = TEST_TABLE.schema.fields[columnIndex];
// declare it's not null
const originalVector = TEST_TABLE.getChildAt(columnIndex) as arrow.Vector;
const fieldPtr = FFI_TABLE.schemaAddr(columnIndex);
const field = parseField(WASM_MEMORY.buffer, fieldPtr);

expect(field.name).toStrictEqual(originalField.name);
expect(field.typeId).toStrictEqual(originalField.typeId);
expect(field.nullable).toStrictEqual(originalField.nullable);

const arrayPtr = FFI_TABLE.arrayAddr(0, columnIndex);
const wasmVector = parseVector(
WASM_MEMORY.buffer,
arrayPtr,
field.type,
copy
);

expect(
validityEqual(originalVector, wasmVector),
"validity should be equal"
).toBeTruthy();
}

it("copy=false", () => test(false));
it("copy=true", () => test(true));
});
10 changes: 10 additions & 0 deletions tests/pyarrow_generate_data.py
Original file line number Diff line number Diff line change
@@ -115,6 +115,15 @@ def timestamp_array() -> pa.Array:
return arr


def nullable_int() -> pa.Array:
# True means null
mask = [True, False, True]
arr = pa.array([1, 2, 3], type=pa.uint8(), mask=mask)
assert isinstance(arr, pa.UInt8Array)
assert not arr[0].is_valid
return arr


class MyExtensionType(pa.ExtensionType):
"""
Refer to https://arrow.apache.org/docs/python/extending_types.html for
@@ -160,6 +169,7 @@ def table() -> pa.Table:
"date32": date32_array(),
"date64": date64_array(),
"timestamp": timestamp_array(),
"nullable_int": nullable_int(),
}
)

Binary file modified tests/table.arrow
Binary file not shown.
29 changes: 29 additions & 0 deletions tests/utils.ts
Original file line number Diff line number Diff line change
@@ -35,3 +35,32 @@ export function arraysEqual<T>(

return true;
}

export function validityEqual(v1: arrow.Vector, v2: arrow.Vector): boolean {
if (v1.length !== v2.length) {
return false;
}

if (v1.data.length !== v2.data.length) {
console.log("todo: support different data lengths");
return false;
}
for (let i = 0; i < v1.data.length; i++) {
const d1 = v1.data[i];
const d2 = v2.data[i];
// Check that null bitmaps have same length
if (d1 !== null && d2 !== null) {
if (d1.nullBitmap.length !== d2.nullBitmap.length) {
return false;
}
}
}

for (let i = 0; i < v1.length; i++) {
if (v1.isValid(i) !== v2.isValid(i)) {
return false;
}
}

return true;
}

0 comments on commit 428ce94

Please sign in to comment.