diff --git a/includes/acl/compression/impl/normalize_streams.h b/includes/acl/compression/impl/normalize_streams.h index a64161d5..dedd2eff 100644 --- a/includes/acl/compression/impl/normalize_streams.h +++ b/includes/acl/compression/impl/normalize_streams.h @@ -100,28 +100,114 @@ namespace acl { const rtm::vector4f one = rtm::vector_set(1.0F); const rtm::vector4f zero = rtm::vector_zero(); + +#ifdef ACL_PRECISION_BOOST + + const float max_range_value_flt = float((1 << k_segment_range_reduction_num_bits_per_component) - 2); + +#else + const float max_range_value_flt = float((1 << k_segment_range_reduction_num_bits_per_component) - 1); + +#endif + const rtm::vector4f max_range_value = rtm::vector_set(max_range_value_flt); const rtm::vector4f inv_max_range_value = rtm::vector_set(1.0F / max_range_value_flt); +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f half = rtm::vector_set(0.5F); + const rtm::vector4f half_neg = rtm::vector_set(-0.5F); + const rtm::vector4f half_range_value = rtm::vector_set(float((1 << (k_segment_range_reduction_num_bits_per_component - 1)) - 1)); + // Segment ranges are always normalized and live between [-0.5 ... 0.5] + +#else + // Segment ranges are always normalized and live between [0.0 ... 1.0] +#endif + auto fixup_range = [&](const track_stream_range& range) { + + +#ifdef ACL_PRECISION_BOOST + + // In our compressed format, we store the center value of the track range quantized on 8 bits. + // To get the best accuracy, we pick the values closest to the true minimum that is slightly lower, and true maximum that is slightly higher. + // This is to ensure that we encompass the lowest and highest values even after quantization. + const rtm::vector4f range_min = range.get_min(); + const rtm::vector4f quantized_min0 = rtm::vector_clamp(rtm::vector_add(rtm::vector_floor(rtm::vector_mul_add(range_min, max_range_value, half)), half_range_value), zero, max_range_value); + +#else + // In our compressed format, we store the minimum value of the track range quantized on 8 bits. // To get the best accuracy, we pick the value closest to the true minimum that is slightly lower. // This is to ensure that we encompass the lowest value even after quantization. const rtm::vector4f range_min = range.get_min(); const rtm::vector4f scaled_min = rtm::vector_mul(range_min, max_range_value); const rtm::vector4f quantized_min0 = rtm::vector_clamp(rtm::vector_floor(scaled_min), zero, max_range_value); + +#endif + const rtm::vector4f quantized_min1 = rtm::vector_max(rtm::vector_sub(quantized_min0, one), zero); +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f padded_range_min0 = rtm::vector_mul_add(quantized_min0, inv_max_range_value, half_neg); + +#else + const rtm::vector4f padded_range_min0 = rtm::vector_mul(quantized_min0, inv_max_range_value); const rtm::vector4f padded_range_min1 = rtm::vector_mul(quantized_min1, inv_max_range_value); +#endif + // Check if min0 is below or equal to our original range minimum value, if it is, it is good // enough to use otherwise min1 is guaranteed to be lower. 
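+ // As a hedged worked example (illustrative numbers, not from the library): with range_min = -0.3 and the 8-bit
+ // segment components above (max_range_value = 254, half_range_value = 127), quantized_min0 is
+ // floor(-0.3 * 254 + 0.5) + 127 = 51, which decodes to 51 / 254 - 0.5 = -0.2992. That sits above the true minimum,
+ // so the selection below falls back to quantized_min1 = 50, which decodes to -0.3031 and safely encloses -0.3.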
const rtm::mask4f is_min0_lower_mask = rtm::vector_less_equal(padded_range_min0, range_min); + +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f quantized_min = rtm::vector_select(is_min0_lower_mask, quantized_min0, quantized_min1); + + const rtm::vector4f range_max = range.get_max(); + const rtm::vector4f quantized_max0 = rtm::vector_clamp(rtm::vector_add(rtm::vector_floor(rtm::vector_mul_add(range_max, max_range_value, half)), half_range_value), zero, max_range_value); + const rtm::vector4f quantized_max1 = rtm::vector_min(rtm::vector_add(quantized_max0, one), max_range_value); + const rtm::vector4f padded_range_max0 = rtm::vector_mul_add(quantized_max0, inv_max_range_value, half_neg); + const rtm::mask4f is_max0_higher_mask = rtm::vector_greater_equal(padded_range_max0, range_max); + rtm::vector4f quantized_max = rtm::vector_select(is_max0_higher_mask, quantized_max0, quantized_max1); + + // Finally, we pad the range further so the midpoint has minimal quantization error. + const rtm::vector4f quantized_extent = rtm::vector_sub(quantized_max, quantized_min); + union MaskVector + { + constexpr explicit MaskVector(rtm::mask4f_arg0 mask_arg) : mask(mask_arg) {} + rtm::mask4f mask; + rtm::vector4f vector; + }; + union VectorMask + { + constexpr explicit VectorMask(rtm::vector4f_arg0 vector_arg) : vector(vector_arg) {} + rtm::vector4f vector; + rtm::mask4f mask; + }; + const MaskVector is_odd(rtm::mask_set( + (static_cast<uint32_t>(rtm::vector_get_x(quantized_extent)) & 1) != 0, + (static_cast<uint32_t>(rtm::vector_get_y(quantized_extent)) & 1) != 0, + (static_cast<uint32_t>(rtm::vector_get_z(quantized_extent)) & 1) != 0, + (static_cast<uint32_t>(rtm::vector_get_w(quantized_extent)) & 1) != 0)); + const MaskVector prepend(rtm::vector_greater_equal(quantized_min, rtm::vector_sub(max_range_value, quantized_max))); + const MaskVector all(rtm::mask_set(true, true, true, true)); + quantized_min = rtm::vector_select(VectorMask(rtm::vector_and(is_odd.vector, prepend.vector)).mask, rtm::vector_sub(quantized_min, one), quantized_min); + quantized_max = rtm::vector_select(VectorMask(rtm::vector_and(is_odd.vector, rtm::vector_xor(prepend.vector, all.vector))).mask, rtm::vector_add(quantized_max, one), quantized_max); + ACL_ASSERT(rtm::vector_all_greater_equal(quantized_min, zero), "Min too low."); + ACL_ASSERT(rtm::vector_all_greater_equal(quantized_max, quantized_min), "Max too low."); + ACL_ASSERT(rtm::vector_all_greater_equal(max_range_value, quantized_max), "Max too high."); + return track_stream_range::from_min_max(rtm::vector_mul_add(quantized_min, inv_max_range_value, half_neg), rtm::vector_mul_add(quantized_max, inv_max_range_value, half_neg)); +
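+ // A hedged worked example of the midpoint padding above (illustrative numbers, not from the library): suppose
+ // quantized_min = 10 and quantized_max = 21. The extent (11) is odd, so the true midpoint (15.5) falls between two
+ // quantized steps. Since quantized_min (10) is less than max_range_value - quantized_max (233), the prepend mask is
+ // false and we append instead: quantized_max becomes 22, the extent (12) is even, and the midpoint (16) now lands
+ // exactly on a step. The asserts above verify the widened range still fits inside [0, 254].
+
+#else + const rtm::vector4f padded_range_min = rtm::vector_select(is_min0_lower_mask, padded_range_min0, padded_range_min1); // The story is different for the extent.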
We do not store the max, instead we use the extent @@ -144,6 +230,9 @@ namespace acl const rtm::vector4f padded_range_extent = rtm::vector_select(is_extent0_higher_mask, padded_range_extent0, padded_range_extent1); return track_stream_range::from_min_extent(padded_range_min, padded_range_extent); + +#endif + }; for (segment_context& segment : context.segment_iterator()) @@ -171,21 +260,54 @@ namespace acl inline rtm::vector4f RTM_SIMD_CALL normalize_sample(rtm::vector4f_arg0 sample, const track_stream_range& range) { + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = range.get_center(); + +#else + const rtm::vector4f range_min = range.get_min(); + +#endif + const rtm::vector4f range_extent = range.get_extent(); const rtm::mask4f is_range_zero_mask = rtm::vector_less_than(range_extent, rtm::vector_set(0.000000001F)); +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f normalized_sample = rtm::vector_div(rtm::vector_sub(sample, range_center), range_extent); + // Clamp because the division might be imprecise + const rtm::vector4f half_neg = rtm::vector_set(-0.5F); + normalized_sample = rtm::vector_clamp(normalized_sample, half_neg, rtm::vector_set(0.5F)); + return rtm::vector_select(is_range_zero_mask, half_neg, normalized_sample); + +#else + rtm::vector4f normalized_sample = rtm::vector_div(rtm::vector_sub(sample, range_min), range_extent); // Clamp because the division might be imprecise normalized_sample = rtm::vector_min(normalized_sample, rtm::vector_set(1.0F)); return rtm::vector_select(is_range_zero_mask, rtm::vector_zero(), normalized_sample); + +#endif + } inline void normalize_rotation_streams(transform_streams* bone_streams, const transform_range* bone_ranges, uint32_t num_bones) { + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f half = rtm::vector_set(0.5F); + const rtm::vector4f half_neg = rtm::vector_set(-0.5F); + +#else + const rtm::vector4f one = rtm::vector_set(1.0F); const rtm::vector4f zero = rtm::vector_zero(); +#endif + for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index) { transform_streams& bone_stream = bone_streams[bone_index]; @@ -200,30 +322,83 @@ namespace acl const uint32_t num_samples = bone_stream.rotations.get_num_samples(); +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = bone_range.rotation.get_center(); + +#else + const rtm::vector4f range_min = bone_range.rotation.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.rotation.get_extent(); const rtm::mask4f is_range_zero_mask = rtm::vector_less_than(range_extent, rtm::vector_set(0.000000001F)); for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index) { + +#ifdef ACL_PRECISION_BOOST + + // normalized value is between [-0.5 .. 0.5] + // value = (normalized value * range extent) + range center + // normalized value = (value - range center) / range extent + +#else + // normalized value is between [0.0 .. 
1.0] // value = (normalized value * range extent) + range min // normalized value = (value - range min) / range extent + +#endif + const rtm::vector4f rotation = bone_stream.rotations.get_raw_sample(sample_index); + +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f normalized_rotation = rtm::vector_div(rtm::vector_sub(rotation, range_center), range_extent); + // Clamp because the division might be imprecise + normalized_rotation = rtm::vector_clamp(normalized_rotation, half_neg, half); + normalized_rotation = rtm::vector_select(is_range_zero_mask, half_neg, normalized_rotation); + +#else + rtm::vector4f normalized_rotation = rtm::vector_div(rtm::vector_sub(rotation, range_min), range_extent); // Clamp because the division might be imprecise normalized_rotation = rtm::vector_min(normalized_rotation, one); normalized_rotation = rtm::vector_select(is_range_zero_mask, zero, normalized_rotation); +#endif + #if defined(ACL_HAS_ASSERT_CHECKS) switch (bone_stream.rotations.get_rotation_format()) { case rotation_format8::quatf_full: + +#ifdef ACL_PRECISION_BOOST + + ACL_ASSERT(rtm::vector_all_greater_equal(normalized_rotation, half_neg) && rtm::vector_all_less_equal(normalized_rotation, half), "Invalid normalized rotation. -0.5 <= [%f, %f, %f, %f] <= 0.5", (float)rtm::vector_get_x(normalized_rotation), (float)rtm::vector_get_y(normalized_rotation), (float)rtm::vector_get_z(normalized_rotation), (float)rtm::vector_get_w(normalized_rotation)); + +#else + ACL_ASSERT(rtm::vector_all_greater_equal(normalized_rotation, zero) && rtm::vector_all_less_equal(normalized_rotation, one), "Invalid normalized rotation. 0.0 <= [%f, %f, %f, %f] <= 1.0", (float)rtm::vector_get_x(normalized_rotation), (float)rtm::vector_get_y(normalized_rotation), (float)rtm::vector_get_z(normalized_rotation), (float)rtm::vector_get_w(normalized_rotation)); + +#endif + break; case rotation_format8::quatf_drop_w_full: case rotation_format8::quatf_drop_w_variable: + +#ifdef ACL_PRECISION_BOOST + + ACL_ASSERT(rtm::vector_all_greater_equal3(normalized_rotation, half_neg) && rtm::vector_all_less_equal3(normalized_rotation, half), "Invalid normalized rotation. -0.5 <= [%f, %f, %f] <= 0.5", (float)rtm::vector_get_x(normalized_rotation), (float)rtm::vector_get_y(normalized_rotation), (float)rtm::vector_get_z(normalized_rotation)); + +#else + ACL_ASSERT(rtm::vector_all_greater_equal3(normalized_rotation, zero) && rtm::vector_all_less_equal3(normalized_rotation, one), "Invalid normalized rotation. 
0.0 <= [%f, %f, %f] <= 1.0", (float)rtm::vector_get_x(normalized_rotation), (float)rtm::vector_get_y(normalized_rotation), (float)rtm::vector_get_z(normalized_rotation)); + +#endif + break; } #endif @@ -235,9 +410,19 @@ namespace acl inline void normalize_translation_streams(transform_streams* bone_streams, const transform_range* bone_ranges, uint32_t num_bones) { + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f half = rtm::vector_set(0.5F); + const rtm::vector4f half_neg = rtm::vector_set(-0.5F); + +#else + const rtm::vector4f one = rtm::vector_set(1.0F); const rtm::vector4f zero = rtm::vector_zero(); +#endif + for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index) { transform_streams& bone_stream = bone_streams[bone_index]; @@ -252,16 +437,49 @@ namespace acl const uint32_t num_samples = bone_stream.translations.get_num_samples(); +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = bone_range.translation.get_center(); + +#else + const rtm::vector4f range_min = bone_range.translation.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.translation.get_extent(); const rtm::mask4f is_range_zero_mask = rtm::vector_less_than(range_extent, rtm::vector_set(0.000000001F)); for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index) { + +#ifdef ACL_PRECISION_BOOST + + // normalized value is between [-0.5 .. 0.5] + // value = (normalized value * range extent) + range center + // normalized value = (value - range center) / range extent + +#else + // normalized value is between [0.0 .. 1.0] // value = (normalized value * range extent) + range min // normalized value = (value - range min) / range extent + +#endif + const rtm::vector4f translation = bone_stream.translations.get_raw_sample(sample_index); + +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f normalized_translation = rtm::vector_div(rtm::vector_sub(translation, range_center), range_extent); + // Clamp because the division might be imprecise + normalized_translation = rtm::vector_clamp(normalized_translation, half_neg, half); + normalized_translation = rtm::vector_select(is_range_zero_mask, half_neg, normalized_translation); + + ACL_ASSERT(rtm::vector_all_greater_equal3(normalized_translation, half_neg) && rtm::vector_all_less_equal3(normalized_translation, half), "Invalid normalized translation. -0.5 <= [%f, %f, %f] <= 0.5", (float)rtm::vector_get_x(normalized_translation), (float)rtm::vector_get_y(normalized_translation), (float)rtm::vector_get_z(normalized_translation)); + +#else + rtm::vector4f normalized_translation = rtm::vector_div(rtm::vector_sub(translation, range_min), range_extent); // Clamp because the division might be imprecise normalized_translation = rtm::vector_min(normalized_translation, one); @@ -269,6 +487,8 @@ namespace acl ACL_ASSERT(rtm::vector_all_greater_equal3(normalized_translation, zero) && rtm::vector_all_less_equal3(normalized_translation, one), "Invalid normalized translation. 
0.0 <= [%f, %f, %f] <= 1.0", (float)rtm::vector_get_x(normalized_translation), (float)rtm::vector_get_y(normalized_translation), (float)rtm::vector_get_z(normalized_translation)); +#endif + bone_stream.translations.set_raw_sample(sample_index, normalized_translation); } } @@ -276,9 +496,19 @@ namespace acl inline void normalize_scale_streams(transform_streams* bone_streams, const transform_range* bone_ranges, uint32_t num_bones) { + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f half = rtm::vector_set(0.5F); + const rtm::vector4f half_neg = rtm::vector_set(-0.5F); + +#else + const rtm::vector4f one = rtm::vector_set(1.0F); const rtm::vector4f zero = rtm::vector_zero(); +#endif + for (uint32_t bone_index = 0; bone_index < num_bones; ++bone_index) { transform_streams& bone_stream = bone_streams[bone_index]; @@ -293,16 +523,49 @@ namespace acl const uint32_t num_samples = bone_stream.scales.get_num_samples(); +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = bone_range.scale.get_center(); + +#else + const rtm::vector4f range_min = bone_range.scale.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.scale.get_extent(); const rtm::mask4f is_range_zero_mask = rtm::vector_less_than(range_extent, rtm::vector_set(0.000000001F)); for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index) { + +#ifdef ACL_PRECISION_BOOST + + // normalized value is between [-0.5 .. 0.5] + // value = (normalized value * range extent) + range center + // normalized value = (value - range center) / range extent + +#else + // normalized value is between [0.0 .. 1.0] // value = (normalized value * range extent) + range min // normalized value = (value - range min) / range extent + +#endif + const rtm::vector4f scale = bone_stream.scales.get_raw_sample(sample_index); + +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f normalized_scale = rtm::vector_div(rtm::vector_sub(scale, range_center), range_extent); + // Clamp because the division might be imprecise + normalized_scale = rtm::vector_clamp(normalized_scale, half_neg, half); + normalized_scale = rtm::vector_select(is_range_zero_mask, half_neg, normalized_scale); + + ACL_ASSERT(rtm::vector_all_greater_equal3(normalized_scale, half_neg) && rtm::vector_all_less_equal3(normalized_scale, half), "Invalid normalized scale. -0.5 <= [%f, %f, %f] <= 0.5", (float)rtm::vector_get_x(normalized_scale), (float)rtm::vector_get_y(normalized_scale), (float)rtm::vector_get_z(normalized_scale)); + +#else + rtm::vector4f normalized_scale = rtm::vector_div(rtm::vector_sub(scale, range_min), range_extent); // Clamp because the division might be imprecise normalized_scale = rtm::vector_min(normalized_scale, one); @@ -310,6 +573,8 @@ namespace acl ACL_ASSERT(rtm::vector_all_greater_equal3(normalized_scale, zero) && rtm::vector_all_less_equal3(normalized_scale, one), "Invalid normalized scale. 
0.0 <= [%f, %f, %f] <= 1.0", (float)rtm::vector_get_x(normalized_scale), (float)rtm::vector_get_y(normalized_scale), (float)rtm::vector_get_z(normalized_scale)); +#endif + bone_stream.scales.set_raw_sample(sample_index, normalized_scale); } } diff --git a/includes/acl/compression/impl/normalize_track_impl.h b/includes/acl/compression/impl/normalize_track_impl.h index 70752116..71e9aaf8 100644 --- a/includes/acl/compression/impl/normalize_track_impl.h +++ b/includes/acl/compression/impl/normalize_track_impl.h @@ -42,24 +42,65 @@ namespace acl { using namespace rtm; +#ifdef ACL_PRECISION_BOOST + + const vector4f half = rtm::vector_set(0.5F); + const vector4f half_neg = rtm::vector_set(-0.5F); + +#else + const vector4f one = rtm::vector_set(1.0F); const vector4f zero = vector_zero(); +#endif + track_vector4f& typed_track = track_cast(mut_track); const uint32_t num_samples = mut_track.get_num_samples(); +#ifdef ACL_PRECISION_BOOST + + const vector4f range_center = range.get_center(); + +#else const vector4f range_min = range.get_min(); + +#endif + const vector4f range_extent = range.get_extent(); const mask4f is_range_zero_mask = vector_less_than(range_extent, rtm::vector_set(0.000000001F)); for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index) { + +#ifdef ACL_PRECISION_BOOST + + // normalized value is between [-0.5 .. 0.5] + // value = (normalized value * range extent) + range center + // normalized value = (value - range center) / range extent + +#else + // normalized value is between [0.0 .. 1.0] // value = (normalized value * range extent) + range min // normalized value = (value - range min) / range extent + +#endif + const vector4f sample = typed_track[sample_index]; +#ifdef ACL_PRECISION_BOOST + + vector4f normalized_sample = vector_div(vector_sub(sample, range_center), range_extent); + + // Clamp because the division might be imprecise + normalized_sample = rtm::vector_clamp(normalized_sample, half_neg, half); + normalized_sample = rtm::vector_select(is_range_zero_mask, half_neg, normalized_sample); + + ACL_ASSERT(vector_all_greater_equal(normalized_sample, half_neg) && vector_all_less_equal(normalized_sample, half), "Invalid normalized value. -0.5 <= [%f, %f, %f, %f] <= 0.5", (float)vector_get_x(normalized_sample), (float)vector_get_y(normalized_sample), (float)vector_get_z(normalized_sample), (float)vector_get_w(normalized_sample)); + +#else + vector4f normalized_sample = vector_div(vector_sub(sample, range_min), range_extent); // Clamp because the division might be imprecise @@ -68,6 +109,8 @@ namespace acl ACL_ASSERT(vector_all_greater_equal(normalized_sample, zero) && vector_all_less_equal(normalized_sample, one), "Invalid normalized value. 
0.0 <= [%f, %f, %f, %f] <= 1.0", (float)vector_get_x(normalized_sample), (float)vector_get_y(normalized_sample), (float)vector_get_z(normalized_sample), (float)vector_get_w(normalized_sample)); +#endif + typed_track[sample_index] = normalized_sample; } } diff --git a/includes/acl/compression/impl/quantize_streams.h b/includes/acl/compression/impl/quantize_streams.h index 40c707d2..d64d44f5 100644 --- a/includes/acl/compression/impl/quantize_streams.h +++ b/includes/acl/compression/impl/quantize_streams.h @@ -353,7 +353,17 @@ namespace acl const rtm::vector4f normalized_rotation = normalize_sample(rotation, clip_range); uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(0); + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_sn48_unsafe_precise_endpoints(normalized_rotation, quantized_ptr); + +#else + pack_vector3_u48_unsafe(normalized_rotation, quantized_ptr); + +#endif + } else { @@ -372,7 +382,17 @@ namespace acl else { const rtm::quatf rotation = raw_segment_stream.get_raw_sample(sample_index); + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_snXX_unsafe(rtm::quat_to_vector(rotation), num_bits_at_bit_rate, quantized_ptr); + +#else + pack_vector3_uXX_unsafe(rtm::quat_to_vector(rotation), num_bits_at_bit_rate, quantized_ptr); + +#endif + } } } @@ -465,7 +485,17 @@ namespace acl const rtm::vector4f normalized_translation = normalize_sample(translation, clip_range); uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(0); + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_sn48_unsafe_precise_endpoints(normalized_translation, quantized_ptr); + +#else + pack_vector3_u48_unsafe(normalized_translation, quantized_ptr); + +#endif + } else { @@ -483,7 +513,17 @@ namespace acl else { const rtm::vector4f translation = raw_segment_stream.get_raw_sample(sample_index); + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_snXX_unsafe(translation, num_bits_at_bit_rate, quantized_ptr); + +#else + pack_vector3_uXX_unsafe(translation, num_bits_at_bit_rate, quantized_ptr); + +#endif + } } } @@ -575,7 +615,17 @@ namespace acl const rtm::vector4f normalized_scale = normalize_sample(scale, clip_range); uint8_t* quantized_ptr = quantized_stream.get_raw_sample_ptr(0); + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_sn48_unsafe_precise_endpoints(normalized_scale, quantized_ptr); + +#else + pack_vector3_u48_unsafe(normalized_scale, quantized_ptr); + +#endif + } else { @@ -593,7 +643,17 @@ namespace acl else { const rtm::vector4f scale = raw_segment_stream.get_raw_sample(sample_index); + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_snXX_unsafe(scale, num_bits_at_bit_rate, quantized_ptr); + +#else + pack_vector3_uXX_unsafe(scale, num_bits_at_bit_rate, quantized_ptr); + +#endif + } } } diff --git a/includes/acl/compression/impl/quantize_track_impl.h b/includes/acl/compression/impl/quantize_track_impl.h index 08d7bdb1..133fa729 100644 --- a/includes/acl/compression/impl/quantize_track_impl.h +++ b/includes/acl/compression/impl/quantize_track_impl.h @@ -45,18 +45,57 @@ namespace acl rtm::vector4f max_value; rtm::vector4f inv_max_value; +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f limit; + rtm::vector4f mid_compress; + rtm::vector4f mid_decompress; + +#endif + explicit quantization_scales(uint32_t num_bits) { ACL_ASSERT(num_bits > 0, "Cannot decay with 0 bits"); + +#ifdef ACL_PRECISION_BOOST + + ACL_ASSERT(num_bits < 25, "Attempting to decay on too many bits"); + + const float max_value_ = rtm::scalar_safe_to_float(1 << num_bits); + limit = rtm::vector_set(max_value_ - 1.0F); + mid_compress = rtm::vector_set(0.5F * 
max_value_); + mid_decompress = rtm::vector_set((0.5F * max_value_) - 0.5F); + +#else + ACL_ASSERT(num_bits < 31, "Attempting to decay on too many bits"); const float max_value_ = rtm::scalar_safe_to_float((1 << num_bits) - 1); + +#endif + max_value = rtm::vector_set(max_value_); inv_max_value = rtm::vector_set(1.0F / max_value_); } }; // Decays the input value through quantization by packing and unpacking a normalized input value + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL decay_vector4_snXX(rtm::vector4f_arg0 value, const quantization_scales& scales) + { + using namespace rtm; + + ACL_ASSERT(vector_all_greater_equal(value, rtm::vector_set(-0.5F)) && vector_all_less_equal(value, rtm::vector_set(0.5F)), "Expected normalized signed input value: %f, %f, %f, %f", (float)vector_get_x(value), (float)vector_get_y(value), (float)vector_get_z(value), (float)vector_get_w(value)); + + const vector4f packed_value = vector_min(vector_add(vector_floor(vector_mul(value, scales.max_value)), scales.mid_compress), scales.limit); + const vector4f decayed_value = vector_mul(vector_sub(packed_value, scales.mid_decompress), scales.inv_max_value); + return decayed_value; + } + +#else + inline rtm::vector4f RTM_SIMD_CALL decay_vector4_uXX(rtm::vector4f_arg0 value, const quantization_scales& scales) { using namespace rtm; @@ -67,8 +106,23 @@ namespace acl const vector4f decayed_value = vector_mul(packed_value, scales.inv_max_value); return decayed_value; } + +#endif // Packs a normalized input value through quantization + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL pack_vector4_snXX(rtm::vector4f_arg0 value, const quantization_scales& scales) + { + using namespace rtm; + ACL_ASSERT(vector_all_greater_equal(value, rtm::vector_set(-0.5F)) && vector_all_less_equal(value, rtm::vector_set(0.5F)), "Expected normalized signed input value: %f, %f, %f, %f", (float)vector_get_x(value), (float)vector_get_y(value), (float)vector_get_z(value), (float)vector_get_w(value)); + + return vector_min(vector_add(vector_floor(vector_mul(value, scales.max_value)), scales.mid_compress), scales.limit); + } + +#else + inline rtm::vector4f RTM_SIMD_CALL pack_vector4_uXX(rtm::vector4f_arg0 value, const quantization_scales& scales) { using namespace rtm; @@ -78,6 +132,8 @@ namespace acl return vector_round_symmetric(vector_mul(value, scales.max_value)); } +#endif + inline void quantize_scalarf_track(track_list_context& context, uint32_t track_index) { using namespace rtm; @@ -90,7 +146,17 @@ namespace acl const uint32_t num_samples = mut_track.get_num_samples(); const scalarf_range& range = context.range_list[track_index].range.scalarf; + +#ifdef ACL_PRECISION_BOOST + + const vector4f range_center = range.get_center(); + +#else + const vector4f range_min = range.get_min(); + +#endif + const vector4f range_extent = range.get_extent(); const vector4f zero = vector_zero(); @@ -113,6 +179,16 @@ namespace acl std::memcpy(&raw_sample, ref_track[sample_index], ref_element_size); const vector4f normalized_sample = mut_track[sample_index]; + +#ifdef ACL_PRECISION_BOOST + + // Decay our value through quantization + const vector4f decayed_normalized_sample = decay_vector4_snXX(normalized_sample, scales); + + // Undo normalization + const vector4f decayed_sample = vector_mul_add(decayed_normalized_sample, range_extent, range_center); + +#else // Decay our value through quantization const vector4f decayed_normalized_sample = decay_vector4_uXX(normalized_sample, scales); @@ -120,6 +196,8 @@ namespace acl // 
Undo normalization const vector4f decayed_sample = vector_mul_add(decayed_normalized_sample, range_extent, range_min); +#endif + const vector4f delta = vector_abs(vector_sub(raw_sample, decayed_sample)); const vector4f masked_delta = vector_select(sample_mask, delta, zero); if (!vector_all_less_equal(masked_delta, precision)) @@ -152,7 +230,17 @@ namespace acl const quantization_scales scales(num_bits_at_bit_rate); for (uint32_t sample_index = 0; sample_index < num_samples; ++sample_index) + +#ifdef ACL_PRECISION_BOOST + + mut_track[sample_index] = pack_vector4_snXX(mut_track[sample_index], scales); + +#else + mut_track[sample_index] = pack_vector4_uXX(mut_track[sample_index], scales); + +#endif + } } diff --git a/includes/acl/compression/impl/sample_streams.h b/includes/acl/compression/impl/sample_streams.h index 41bba13d..1f2f7ffe 100644 --- a/includes/acl/compression/impl/sample_streams.h +++ b/includes/acl/compression/impl/sample_streams.h @@ -60,14 +60,34 @@ namespace acl ACL_ASSERT(bit_rate != k_invalid_bit_rate, "Invalid bit rate!"); if (is_constant_bit_rate(bit_rate)) { + +#ifdef ACL_PRECISION_BOOST + + return unpack_vector3_sn48_unsafe_precise_endpoints(ptr); + +#else + return unpack_vector3_u48_unsafe(ptr); + +#endif + } else if (is_raw_bit_rate(bit_rate)) return unpack_vector3_96_unsafe(ptr); else { const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate); + +#ifdef ACL_PRECISION_BOOST + + return unpack_vector3_snXX_unsafe(num_bits_at_bit_rate, ptr, 0); + +#else + return unpack_vector3_uXX_unsafe(num_bits_at_bit_rate, ptr, 0); + +#endif + } default: ACL_ASSERT(false, "Invalid or unsupported rotation format: %s", get_rotation_format_name(format)); @@ -84,13 +104,33 @@ namespace acl case vector_format8::vector3f_variable: ACL_ASSERT(bit_rate != k_invalid_bit_rate, "Invalid bit rate!"); if (is_constant_bit_rate(bit_rate)) + +#ifdef ACL_PRECISION_BOOST + + return unpack_vector3_sn48_unsafe_precise_endpoints(ptr); + +#else + return unpack_vector3_u48_unsafe(ptr); + +#endif + else if (is_raw_bit_rate(bit_rate)) return unpack_vector3_96_unsafe(ptr); else { const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate); + +#ifdef ACL_PRECISION_BOOST + + return unpack_vector3_snXX_unsafe(num_bits_at_bit_rate, ptr, 0); + +#else + return unpack_vector3_uXX_unsafe(num_bits_at_bit_rate, ptr, 0); + +#endif + } default: ACL_ASSERT(false, "Invalid or unsupported vector format: %s", get_vector_format_name(format)); @@ -135,18 +175,42 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.rotation.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.rotation.get_extent(); + + packed_rotation = rtm::vector_mul_add(packed_rotation, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.rotation.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.rotation.get_extent(); packed_rotation = rtm::vector_mul_add(packed_rotation, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = clip_bone_range.rotation.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.rotation.get_extent(); + + packed_rotation = rtm::vector_mul_add(packed_rotation, clip_range_extent, 
clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.rotation.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.rotation.get_extent(); packed_rotation = rtm::vector_mul_add(packed_rotation, clip_range_extent, clip_range_min); + +#endif + } return acl_impl::rotation_to_quat_32(packed_rotation, format); @@ -187,31 +251,73 @@ namespace acl const transform_range& clip_bone_range = segment->clip->ranges[bone_steams.bone_index]; const rtm::vector4f normalized_rotation = normalize_sample(rotation, clip_bone_range.rotation); +#ifdef ACL_PRECISION_BOOST + + packed_rotation = decay_vector3_sn48_precise_endpoints(normalized_rotation); + +#else + packed_rotation = decay_vector3_u48(normalized_rotation); + +#endif + } else if (is_raw_bit_rate(bit_rate)) packed_rotation = rotation; else + +#ifdef ACL_PRECISION_BOOST + + packed_rotation = decay_vector3_snXX(rotation, num_bits_at_bit_rate); + +#else + packed_rotation = decay_vector3_uXX(rotation, num_bits_at_bit_rate); +#endif + if (!is_raw_bit_rate(bit_rate)) { if (segment->are_rotations_normalized && !is_constant_bit_rate(bit_rate)) { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.rotation.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.rotation.get_extent(); + + packed_rotation = rtm::vector_mul_add(packed_rotation, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.rotation.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.rotation.get_extent(); packed_rotation = rtm::vector_mul_add(packed_rotation, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = clip_bone_range.rotation.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.rotation.get_extent(); + + packed_rotation = rtm::vector_mul_add(packed_rotation, clip_range_extent, clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.rotation.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.rotation.get_extent(); packed_rotation = rtm::vector_mul_add(packed_rotation, clip_range_extent, clip_range_min); + +#endif + } return acl_impl::rotation_to_quat_32(packed_rotation, format); @@ -250,18 +356,43 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.rotation.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.rotation.get_extent(); + + packed_rotation = rtm::vector_mul_add(packed_rotation, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.rotation.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.rotation.get_extent(); packed_rotation = rtm::vector_mul_add(packed_rotation, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = clip_bone_range.rotation.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.rotation.get_extent(); + + packed_rotation = 
rtm::vector_mul_add(packed_rotation, clip_range_extent, clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.rotation.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.rotation.get_extent(); packed_rotation = rtm::vector_mul_add(packed_rotation, clip_range_extent, clip_range_min); + +#endif + } return acl_impl::rotation_to_quat_32(packed_rotation, format); @@ -290,18 +421,42 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.translation.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.translation.get_extent(); + + packed_translation = rtm::vector_mul_add(packed_translation, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.translation.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.translation.get_extent(); packed_translation = rtm::vector_mul_add(packed_translation, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = clip_bone_range.translation.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.translation.get_extent(); + + packed_translation = rtm::vector_mul_add(packed_translation, clip_range_extent, clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.translation.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.translation.get_extent(); packed_translation = rtm::vector_mul_add(packed_translation, clip_range_extent, clip_range_min); + +#endif + } return packed_translation; @@ -336,14 +491,33 @@ namespace acl const transform_range& clip_bone_range = segment->clip->ranges[bone_steams.bone_index]; const rtm::vector4f normalized_translation = normalize_sample(translation, clip_bone_range.translation); +#ifdef ACL_PRECISION_BOOST + + packed_translation = decay_vector3_sn48_precise_endpoints(normalized_translation); + +#else + packed_translation = decay_vector3_u48(normalized_translation); + +#endif + } else if (is_raw_bit_rate(bit_rate)) packed_translation = translation; else { const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate); + +#ifdef ACL_PRECISION_BOOST + + packed_translation = decay_vector3_snXX(translation, num_bits_at_bit_rate); + +#else + packed_translation = decay_vector3_uXX(translation, num_bits_at_bit_rate); + +#endif + } if (!is_raw_bit_rate(bit_rate)) @@ -352,18 +526,42 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.translation.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.translation.get_extent(); + + packed_translation = rtm::vector_mul_add(packed_translation, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.translation.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.translation.get_extent(); packed_translation = rtm::vector_mul_add(packed_translation, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = 
clip_bone_range.translation.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.translation.get_extent(); + + packed_translation = rtm::vector_mul_add(packed_translation, clip_range_extent, clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.translation.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.translation.get_extent(); packed_translation = rtm::vector_mul_add(packed_translation, clip_range_extent, clip_range_min); + +#endif + } return packed_translation; @@ -400,18 +598,43 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; + +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f segment_range_center = segment_bone_range.translation.get_center(); + rtm::vector4f segment_range_extent = segment_bone_range.translation.get_extent(); + + packed_translation = rtm::vector_mul_add(packed_translation, segment_range_extent, segment_range_center); + +#else + rtm::vector4f segment_range_min = segment_bone_range.translation.get_min(); rtm::vector4f segment_range_extent = segment_bone_range.translation.get_extent(); packed_translation = rtm::vector_mul_add(packed_translation, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f clip_range_center = clip_bone_range.translation.get_center(); + rtm::vector4f clip_range_extent = clip_bone_range.translation.get_extent(); + + packed_translation = rtm::vector_mul_add(packed_translation, clip_range_extent, clip_range_center); + +#else + rtm::vector4f clip_range_min = clip_bone_range.translation.get_min(); rtm::vector4f clip_range_extent = clip_bone_range.translation.get_extent(); packed_translation = rtm::vector_mul_add(packed_translation, clip_range_extent, clip_range_min); + +#endif + } return packed_translation; @@ -439,18 +662,42 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.scale.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.scale.get_extent(); + + packed_scale = rtm::vector_mul_add(packed_scale, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.scale.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.scale.get_extent(); packed_scale = rtm::vector_mul_add(packed_scale, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = clip_bone_range.scale.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.scale.get_extent(); + + packed_scale = rtm::vector_mul_add(packed_scale, clip_range_extent, clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.scale.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.scale.get_extent(); packed_scale = rtm::vector_mul_add(packed_scale, clip_range_extent, clip_range_min); + +#endif + } return packed_scale; @@ -485,14 +732,33 @@ namespace acl const transform_range& clip_bone_range = segment->clip->ranges[bone_steams.bone_index]; const rtm::vector4f normalized_scale = normalize_sample(scale, clip_bone_range.scale); +#ifdef ACL_PRECISION_BOOST + + packed_scale = 
decay_vector3_sn48_precise_endpoints(normalized_scale); + +#else + packed_scale = decay_vector3_u48(normalized_scale); + +#endif + } else if (is_raw_bit_rate(bit_rate)) packed_scale = scale; else { const uint32_t num_bits_at_bit_rate = get_num_bits_at_bit_rate(bit_rate); + +#ifdef ACL_PRECISION_BOOST + + packed_scale = decay_vector3_snXX(scale, num_bits_at_bit_rate); + +#else + packed_scale = decay_vector3_uXX(scale, num_bits_at_bit_rate); + +#endif + } if (!is_raw_bit_rate(bit_rate)) @@ -501,18 +767,42 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = segment_bone_range.scale.get_center(); + const rtm::vector4f segment_range_extent = segment_bone_range.scale.get_extent(); + + packed_scale = rtm::vector_mul_add(packed_scale, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = segment_bone_range.scale.get_min(); const rtm::vector4f segment_range_extent = segment_bone_range.scale.get_extent(); packed_scale = rtm::vector_mul_add(packed_scale, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f clip_range_center = clip_bone_range.scale.get_center(); + const rtm::vector4f clip_range_extent = clip_bone_range.scale.get_extent(); + + packed_scale = rtm::vector_mul_add(packed_scale, clip_range_extent, clip_range_center); + +#else + const rtm::vector4f clip_range_min = clip_bone_range.scale.get_min(); const rtm::vector4f clip_range_extent = clip_bone_range.scale.get_extent(); packed_scale = rtm::vector_mul_add(packed_scale, clip_range_extent, clip_range_min); + +#endif + } return packed_scale; @@ -549,18 +839,42 @@ namespace acl { const transform_range& segment_bone_range = segment->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f segment_range_center = segment_bone_range.scale.get_center(); + rtm::vector4f segment_range_extent = segment_bone_range.scale.get_extent(); + + packed_scale = rtm::vector_mul_add(packed_scale, segment_range_extent, segment_range_center); + +#else + rtm::vector4f segment_range_min = segment_bone_range.scale.get_min(); rtm::vector4f segment_range_extent = segment_bone_range.scale.get_extent(); packed_scale = rtm::vector_mul_add(packed_scale, segment_range_extent, segment_range_min); + +#endif + } const transform_range& clip_bone_range = clip->ranges[bone_steams.bone_index]; +#ifdef ACL_PRECISION_BOOST + + rtm::vector4f clip_range_center = clip_bone_range.scale.get_center(); + rtm::vector4f clip_range_extent = clip_bone_range.scale.get_extent(); + + packed_scale = rtm::vector_mul_add(packed_scale, clip_range_extent, clip_range_center); + +#else + rtm::vector4f clip_range_min = clip_bone_range.scale.get_min(); rtm::vector4f clip_range_extent = clip_bone_range.scale.get_extent(); packed_scale = rtm::vector_mul_add(packed_scale, clip_range_extent, clip_range_min); + +#endif + } return packed_scale; diff --git a/includes/acl/compression/impl/write_range_data.h b/includes/acl/compression/impl/write_range_data.h index 8c256706..abc204f3 100644 --- a/includes/acl/compression/impl/write_range_data.h +++ b/includes/acl/compression/impl/write_range_data.h @@ -91,6 +91,13 @@ namespace acl const rotation_format8 rotation_format = segment.bone_streams[0].rotations.get_rotation_format(); // The same for every track // Each range entry is a min/extent at most 
sizeof(float4f) each, 32 bytes total max per sub-track, 4 sub-tracks per group + +#ifdef ACL_PRECISION_BOOST + + // This is really "range_group_center", but the sheer number of edits isn't worth it. + +#endif + rtm::vector4f range_group_min[4]; rtm::vector4f range_group_extent[4]; @@ -112,7 +119,16 @@ { const transform_range& bone_range = clip.ranges[bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_min = bone_range.rotation.get_center(); + +#else + const rtm::vector4f range_min = bone_range.rotation.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.rotation.get_extent(); range_group_min[group_size] = range_min; @@ -122,7 +138,16 @@ { const transform_range& bone_range = clip.ranges[bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_min = bone_range.translation.get_center(); + +#else + const rtm::vector4f range_min = bone_range.translation.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.translation.get_extent(); range_group_min[group_size] = range_min; @@ -132,7 +157,16 @@ { const transform_range& bone_range = clip.ranges[bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_min = bone_range.scale.get_center(); + +#else + const rtm::vector4f range_min = bone_range.scale.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.scale.get_extent(); range_group_min[group_size] = range_min; @@ -258,15 +292,34 @@ { const transform_range& bone_range = segment.ranges[bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = bone_range.rotation.get_center(); + +#else + const rtm::vector4f range_min = bone_range.rotation.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.rotation.get_extent(); // Swizzle into SOA form alignas(16) uint8_t range_min_buffer[16]; alignas(16) uint8_t range_extent_buffer[16]; + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_sn24_unsafe_precise_endpoints_midpoint(range_center, range_min_buffer); + pack_vector3_u24_unsafe_precise_endpoints_midpoint(range_extent, range_extent_buffer); + +#else + pack_vector3_u24_unsafe(range_min, range_min_buffer); pack_vector3_u24_unsafe(range_extent, range_extent_buffer); +#endif + range_data_group[group_size + 0] = range_min_buffer[0]; range_data_group[group_size + 4] = range_min_buffer[1]; range_data_group[group_size + 8] = range_min_buffer[2]; @@ -288,12 +341,32 @@ { const transform_range& bone_range = segment.ranges[bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = bone_range.translation.get_center(); + +#else + const rtm::vector4f range_min = bone_range.translation.get_min(); + +#endif + const rtm::vector4f range_extent = bone_range.translation.get_extent(); uint8_t* sub_track_range_data = &range_data_group[group_size * 6]; + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_sn24_unsafe_precise_endpoints_midpoint(range_center, sub_track_range_data); + pack_vector3_u24_unsafe_precise_endpoints_midpoint(range_extent, sub_track_range_data + 3); + +#else + pack_vector3_u24_unsafe(range_min, sub_track_range_data); pack_vector3_u24_unsafe(range_extent, sub_track_range_data + 3); + +#endif + } } else @@ -308,12 +381,32 @@ { const transform_range& bone_range = segment.ranges[bone_index]; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = bone_range.scale.get_center(); + +#else + const rtm::vector4f range_min = bone_range.scale.get_min(); + +#endif + const rtm::vector4f
range_extent = bone_range.scale.get_extent(); uint8_t* sub_track_range_data = &range_data_group[group_size * 6]; + +#ifdef ACL_PRECISION_BOOST + + pack_vector3_sn24_unsafe_precise_endpoints_midpoint(range_center, sub_track_range_data); + pack_vector3_u24_unsafe_precise_endpoints_midpoint(range_extent, sub_track_range_data + 3); + +#else + pack_vector3_u24_unsafe(range_min, sub_track_range_data); pack_vector3_u24_unsafe(range_extent, sub_track_range_data + 3); + +#endif + } } }; diff --git a/includes/acl/compression/impl/write_track_data_impl.h b/includes/acl/compression/impl/write_track_data_impl.h index 86bb8f72..67629e1b 100644 --- a/includes/acl/compression/impl/write_track_data_impl.h +++ b/includes/acl/compression/impl/write_track_data_impl.h @@ -112,9 +112,29 @@ namespace acl { // Only support scalarf for now ACL_ASSERT(range.category == track_category8::scalarf, "Unsupported category"); + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f range_center = range.range.scalarf.get_center(); + +#else + const rtm::vector4f range_min = range.range.scalarf.get_min(); + +#endif + const rtm::vector4f range_extent = range.range.scalarf.get_extent(); + +#ifdef ACL_PRECISION_BOOST + + std::memcpy(output_buffer, &range_center, element_size); + +#else + std::memcpy(output_buffer, &range_min, element_size); + +#endif + std::memcpy(output_buffer + element_size, &range_extent, element_size); } diff --git a/includes/acl/core/impl/compiler_utils.h b/includes/acl/core/impl/compiler_utils.h index 355dd5ad..876d4084 100644 --- a/includes/acl/core/impl/compiler_utils.h +++ b/includes/acl/core/impl/compiler_utils.h @@ -92,3 +92,19 @@ namespace acl #else #define ACL_SWITCH_CASE_FALLTHROUGH_INTENTIONAL (void)0 #endif + +////////////////////////////////////////////////////////////////////////// +// +// Integers between 0 and 2^24 are exactly representable as floats. Leverage this with a maximum quantization of 24 bits. +// +// Floating-point multiplication and division by powers of two are precision-friendly: they shift the exponent without +// touching the mantissa. This drives our quantization. +// +// Normalizing to 0.0F..1.0F is less accurate than normalizing to -0.5F..0.5F. The latter range can represent +// 1/(2^25), which is the error term of 24-bit quantization. +// +// Always floor after scaling, and before shifting from -halfQ..halfQ to 0..fullQ. Otherwise, IEEE float addition will +// round the result before you get a chance to floor it.
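+//
+// For instance (illustrative numbers, not from the code): quantizing v = -0.3 on 8 bits computes
+// floor(-0.3 * 256) + 128 = -77 + 128 = 51, which decodes as (51 - 127.5) / 256 = -0.2988, within half a
+// quantization step (1/512) of the input. Flooring before the shift keeps every intermediate value exactly
+// representable; adding the midpoint first (2^23 when quantizing on 24 bits) can force the sum to round across
+// an integer boundary before the floor runs, e.g. 8388607.5 + 8388608 rounds to 16777216.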
+// +////////////////////////////////////////////////////////////////////////// +#define ACL_PRECISION_BOOST diff --git a/includes/acl/decompression/impl/scalar_track_decompression.h b/includes/acl/decompression/impl/scalar_track_decompression.h index e8e966ca..08392e47 100644 --- a/includes/acl/decompression/impl/scalar_track_decompression.h +++ b/includes/acl/decompression/impl/scalar_track_decompression.h @@ -191,6 +191,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_scalarf_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); + value1 = unpack_scalarf_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); + + const rtm::scalarf range_center = rtm::scalar_load(range_values); + const rtm::scalarf range_extent = rtm::scalar_load(range_values + 1); + value0 = rtm::scalar_mul_add(value0, range_extent, range_center); + value1 = rtm::scalar_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_scalarf_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); value1 = unpack_scalarf_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); @@ -198,6 +211,9 @@ namespace acl const rtm::scalarf range_extent = rtm::scalar_load(range_values + 1); value0 = rtm::scalar_mul_add(value0, range_extent, range_min); value1 = rtm::scalar_mul_add(value1, range_extent, range_min); + +#endif + range_values += 2; } @@ -229,6 +245,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector2_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); + value1 = unpack_vector2_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + 2); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector2_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); value1 = unpack_vector2_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); @@ -236,6 +265,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + 2); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + range_values += 4; } @@ -267,6 +299,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector3_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); + value1 = unpack_vector3_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + 3); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector3_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); value1 = unpack_vector3_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); @@ -274,6 +319,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + 3); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + range_values += 6; } @@ -305,6 +353,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + 
value0 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); + value1 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + 4); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); value1 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); @@ -312,6 +373,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + 4); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + range_values += 8; } @@ -343,6 +407,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); + value1 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + 4); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset0); value1 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, track_bit_offset1); @@ -350,6 +427,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + 4); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + range_values += 8; } @@ -435,6 +515,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_scalarf_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); + value1 = unpack_scalarf_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); + + const rtm::scalarf range_center = rtm::scalar_load(range_values); + const rtm::scalarf range_extent = rtm::scalar_load(range_values + num_element_components); + value0 = rtm::scalar_mul_add(value0, range_extent, range_center); + value1 = rtm::scalar_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_scalarf_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); value1 = unpack_scalarf_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); @@ -442,6 +535,9 @@ namespace acl const rtm::scalarf range_extent = rtm::scalar_load(range_values + num_element_components); value0 = rtm::scalar_mul_add(value0, range_extent, range_min); value1 = rtm::scalar_mul_add(value1, range_extent, range_min); + +#endif + } value = rtm::scalar_lerp(value0, value1, interpolation_alpha); @@ -465,6 +561,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector2_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); + value1 = unpack_vector2_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); + + 
const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector2_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); value1 = unpack_vector2_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); @@ -472,6 +581,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + } value = rtm::vector_lerp(value0, value1, interpolation_alpha); @@ -495,6 +607,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector3_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); + value1 = unpack_vector3_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector3_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); value1 = unpack_vector3_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); @@ -502,6 +627,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + } value = rtm::vector_lerp(value0, value1, interpolation_alpha); @@ -525,6 +653,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); + value1 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); value1 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); @@ -532,6 +673,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + } value = rtm::vector_lerp(value0, value1, interpolation_alpha); @@ -555,6 +699,19 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + value0 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, 
context.key_frame_bit_offsets[0] + track_bit_offset); + value1 = unpack_vector4_snXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); + + const rtm::vector4f range_center = rtm::vector_load(range_values); + const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); + value0 = rtm::vector_mul_add(value0, range_extent, range_center); + value1 = rtm::vector_mul_add(value1, range_extent, range_center); + +#else + value0 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[0] + track_bit_offset); value1 = unpack_vector4_uXX_unsafe(num_bits_per_component, animated_values, context.key_frame_bit_offsets[1] + track_bit_offset); @@ -562,6 +719,9 @@ namespace acl const rtm::vector4f range_extent = rtm::vector_load(range_values + num_element_components); value0 = rtm::vector_mul_add(value0, range_extent, range_min); value1 = rtm::vector_mul_add(value1, range_extent, range_min); + +#endif + } value = rtm::vector_lerp(value0, value1, interpolation_alpha); diff --git a/includes/acl/decompression/impl/transform_animated_track_cache.h b/includes/acl/decompression/impl/transform_animated_track_cache.h index 9c3698f1..31faf748 100644 --- a/includes/acl/decompression/impl/transform_animated_track_cache.h +++ b/includes/acl/decompression/impl/transform_animated_track_cache.h @@ -181,12 +181,25 @@ namespace acl __m128 segment_range_extent_yyyy = _mm_cvtepi32_ps(segment_range_extent_yyyy_u32); __m128 segment_range_extent_zzzz = _mm_cvtepi32_ps(segment_range_extent_zzzz_u32); +#ifdef ACL_PRECISION_BOOST + + const __m128 normalization_value = _mm_set_ps1(1.0F / 254.0F); + const __m128 half_neg = _mm_set_ps1(-0.5F); + + segment_range_min_xxxx = RTM_VECTOR4F_MULV_ADD(segment_range_min_xxxx, normalization_value, half_neg); + segment_range_min_yyyy = RTM_VECTOR4F_MULV_ADD(segment_range_min_yyyy, normalization_value, half_neg); + segment_range_min_zzzz = RTM_VECTOR4F_MULV_ADD(segment_range_min_zzzz, normalization_value, half_neg); + +#else + const __m128 normalization_value = _mm_set_ps1(1.0F / 255.0F); segment_range_min_xxxx = _mm_mul_ps(segment_range_min_xxxx, normalization_value); segment_range_min_yyyy = _mm_mul_ps(segment_range_min_yyyy, normalization_value); segment_range_min_zzzz = _mm_mul_ps(segment_range_min_zzzz, normalization_value); +#endif + segment_range_extent_xxxx = _mm_mul_ps(segment_range_extent_xxxx, normalization_value); segment_range_extent_yyyy = _mm_mul_ps(segment_range_extent_yyyy, normalization_value); segment_range_extent_zzzz = _mm_mul_ps(segment_range_extent_zzzz, normalization_value); @@ -215,12 +228,25 @@ namespace acl float32x4_t segment_range_extent_yyyy = vcvtq_f32_u32(segment_range_extent_yyyy_u32); float32x4_t segment_range_extent_zzzz = vcvtq_f32_u32(segment_range_extent_zzzz_u32); +#ifdef ACL_PRECISION_BOOST + + const float normalization_value = 1.0F / 254.0F; + const float32x4_t half_neg = vdupq_n_f32(-0.5F); + + segment_range_min_xxxx = vmlaq_n_f32(half_neg, segment_range_min_xxxx, normalization_value); + segment_range_min_yyyy = vmlaq_n_f32(half_neg, segment_range_min_yyyy, normalization_value); + segment_range_min_zzzz = vmlaq_n_f32(half_neg, segment_range_min_zzzz, normalization_value); + +#else + const float normalization_value = 1.0F / 255.0F; segment_range_min_xxxx = vmulq_n_f32(segment_range_min_xxxx, normalization_value); segment_range_min_yyyy = vmulq_n_f32(segment_range_min_yyyy, normalization_value); segment_range_min_zzzz = 
vmulq_n_f32(segment_range_min_zzzz, normalization_value); +#endif + segment_range_extent_xxxx = vmulq_n_f32(segment_range_extent_xxxx, normalization_value); segment_range_extent_yyyy = vmulq_n_f32(segment_range_extent_yyyy, normalization_value); segment_range_extent_zzzz = vmulq_n_f32(segment_range_extent_zzzz, normalization_value); @@ -233,12 +259,25 @@ namespace acl rtm::vector4f segment_range_extent_yyyy = rtm::vector_set(float(segment_range_data[16]), float(segment_range_data[17]), float(segment_range_data[18]), float(segment_range_data[19])); rtm::vector4f segment_range_extent_zzzz = rtm::vector_set(float(segment_range_data[20]), float(segment_range_data[21]), float(segment_range_data[22]), float(segment_range_data[23])); +#ifdef ACL_PRECISION_BOOST + + const float normalization_value = 1.0F / 254.0F; + const rtm::vector4f half_neg = rtm::vector_set(-0.5F); + + segment_range_min_xxxx = rtm::vector_mul_add(segment_range_min_xxxx, normalization_value, half_neg); + segment_range_min_yyyy = rtm::vector_mul_add(segment_range_min_yyyy, normalization_value, half_neg); + segment_range_min_zzzz = rtm::vector_mul_add(segment_range_min_zzzz, normalization_value, half_neg); + +#else + const float normalization_value = 1.0F / 255.0F; segment_range_min_xxxx = rtm::vector_mul(segment_range_min_xxxx, normalization_value); segment_range_min_yyyy = rtm::vector_mul(segment_range_min_yyyy, normalization_value); segment_range_min_zzzz = rtm::vector_mul(segment_range_min_zzzz, normalization_value); +#endif + segment_range_extent_xxxx = rtm::vector_mul(segment_range_extent_xxxx, normalization_value); segment_range_extent_yyyy = rtm::vector_mul(segment_range_extent_yyyy, normalization_value); segment_range_extent_zzzz = rtm::vector_mul(segment_range_extent_zzzz, normalization_value); @@ -655,14 +694,44 @@ namespace acl // Convert to floats and normalize __m128i xyz = _mm_setr_epi32(x, y, z, 0); __m128 xyzf = _mm_cvtepi32_ps(xyz); + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = _mm_mul_ps(_mm_sub_ps(xyzf, _mm_set_ps1(32767.5F)), _mm_set_ps1(1.0F / 65535.0F)); + +#else + rotation_as_vec = _mm_mul_ps(xyzf, _mm_set_ps1(1.0F / 65535.0F)); + +#endif + #elif defined(RTM_NEON_INTRINSICS) uint32x4_t xyz = vcombine_u32(vcreate_u32((uint64_t(y) << 32) | x), vcreate_u32(z)); float32x4_t xyzf = vcvtq_f32_u32(xyz); + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = vmulq_n_f32(vsubq_f32(xyzf, vdupq_n_f32(32767.5F)), 1.0F / 65535.0F); + +#else + rotation_as_vec = vmulq_n_f32(xyzf, 1.0F / 65535.0F); + +#endif + #else const rtm::vector4f xyz = rtm::vector_set(float(x), float(y), float(z), 0.0F); + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = rtm::vector_mul(rtm::vector_sub(xyz, rtm::vector_set(32767.5F)), rtm::vector_set(1.0F / 65535.0F)); + +#else + rotation_as_vec = rtm::vector_mul(xyz, 1.0F / 65535.0F); + +#endif + #endif sample_segment_range_ignore_mask = 0xFF; // Ignore segment range @@ -677,7 +746,17 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = unpack_vector3_snXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#else + rotation_as_vec = unpack_vector3_uXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#endif + animated_track_data_bit_offset += num_bits_at_bit_rate * 3; sample_segment_range_ignore_mask = 0x00; sample_clip_range_ignore_mask = 0x00; @@ -835,14 +914,44 @@ namespace acl // Convert to floats and normalize __m128i xyz = _mm_setr_epi32(x, y, z, 0); __m128 xyzf = _mm_cvtepi32_ps(xyz); + 
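All three code paths here (SSE, NEON, scalar) make the same substitution for the 8-bit segment-range bytes: divide by 254 instead of 255 and shift down by 0.5. Because 254 is even, the byte range 0..254 has an exact integer midpoint, which 255 does not. A standalone sketch of the endpoint/midpoint arithmetic (exact division here for clarity; the patch reaches the same values through a precomputed reciprocal and a mul_add):

#include <cassert>

int main()
{
    // Old: byte / 255 spans [0, 1]; 127/255 and 128/255 straddle 0.5, so the
    // midpoint is never exact.
    // New: byte / 254 - 0.5 spans [-0.5, 0.5] with all three landmarks exact:
    assert(0.0f   / 254.0f - 0.5f == -0.5f); // byte 0   -> range start
    assert(127.0f / 254.0f - 0.5f ==  0.0f); // byte 127 -> exact midpoint
    assert(254.0f / 254.0f - 0.5f ==  0.5f); // byte 254 -> range end
    return 0;
}
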
+#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = _mm_mul_ps(_mm_sub_ps(xyzf, _mm_set_ps1(32767.5F)), _mm_set_ps1(1.0F / 65535.0F)); + +#else + rotation_as_vec = _mm_mul_ps(xyzf, _mm_set_ps1(1.0F / 65535.0F)); + +#endif + #elif defined(RTM_NEON_INTRINSICS) uint32x4_t xyz = vcombine_u32(vcreate_u32((uint64_t(y) << 32) | x), vcreate_u32(z)); float32x4_t xyzf = vcvtq_f32_u32(xyz); + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = vmulq_n_f32(vsubq_f32(xyzf, vdupq_n_f32(32767.5F)), 1.0F / 65535.0F); + +#else + rotation_as_vec = vmulq_n_f32(xyzf, 1.0F / 65535.0F); + +#endif + #else const rtm::vector4f xyz = rtm::vector_set(float(x), float(y), float(z), 0.0F); + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = rtm::vector_mul(rtm::vector_sub(xyz, rtm::vector_set(32767.5F)), rtm::vector_set(1.0F / 65535.0F)); + +#else + rotation_as_vec = rtm::vector_mul(xyz, 1.0F / 65535.0F); + +#endif + #endif segment_range_ignore_mask = 0xFF; // Ignore segment range @@ -856,7 +965,17 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + rotation_as_vec = unpack_vector3_snXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#else + rotation_as_vec = unpack_vector3_uXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#endif + segment_range_ignore_mask = 0x00; clip_range_ignore_mask = 0x00; } @@ -908,8 +1027,18 @@ namespace acl rtm::vector4f segment_range_extent = rtm::vector_set(float(extent_x), float(extent_y), float(extent_z), 0.0F); #endif +#ifdef ACL_PRECISION_BOOST + + const float normalization_scale = 1.0F / 254.0F; + segment_range_min = rtm::vector_mul_add(segment_range_min, normalization_scale, rtm::vector_set(-0.5F)); + +#else + const float normalization_scale = 1.0F / 255.0F; segment_range_min = rtm::vector_mul(segment_range_min, normalization_scale); + +#endif + segment_range_extent = rtm::vector_mul(segment_range_extent, normalization_scale); rotation_as_vec = rtm::vector_mul_add(rotation_as_vec, segment_range_extent, segment_range_min); @@ -966,7 +1095,17 @@ namespace acl if (num_bits_at_bit_rate == 0) // Constant bit rate { + +#ifdef ACL_PRECISION_BOOST + + sample = unpack_vector3_sn48_unsafe_precise_endpoints(segment_range_data); + +#else + sample = unpack_vector3_u48_unsafe(segment_range_data); + +#endif + segment_range_data += sizeof(uint16_t) * 3; range_ignore_flags = 0x01; // Skip segment only } @@ -979,7 +1118,17 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + sample = unpack_vector3_snXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#else + sample = unpack_vector3_uXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#endif + animated_track_data_bit_offset += num_bits_at_bit_rate * 3; range_ignore_flags = 0x00; // Don't skip range reduction } @@ -1002,10 +1151,22 @@ namespace acl const uint8_t* segment_range_extent_ptr = segment_range_min_ptr + range_entry_size; segment_range_data = segment_range_extent_ptr + range_entry_size; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(segment_range_min_ptr); + const rtm::vector4f segment_range_extent = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(segment_range_extent_ptr); + + sample = rtm::vector_mul_add(sample, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = unpack_vector3_u24_unsafe(segment_range_min_ptr); const rtm::vector4f segment_range_extent = 
unpack_vector3_u24_unsafe(segment_range_extent_ptr); sample = rtm::vector_mul_add(sample, segment_range_extent, segment_range_min); + +#endif + } if ((range_ignore_flags & 0x02) == 0) @@ -1104,7 +1265,17 @@ namespace acl if (num_bits_at_bit_rate == 0) // Constant bit rate { + +#ifdef ACL_PRECISION_BOOST + + sample = unpack_vector3_sn48_unsafe_precise_endpoints(segment_range_data); + +#else + sample = unpack_vector3_u48_unsafe(segment_range_data); + +#endif + range_ignore_flags = 0x01; // Skip segment only } else if (num_bits_at_bit_rate == 32) // Raw bit rate @@ -1114,7 +1285,17 @@ namespace acl } else { + +#ifdef ACL_PRECISION_BOOST + + sample = unpack_vector3_snXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#else + sample = unpack_vector3_uXX_unsafe(num_bits_at_bit_rate, animated_track_data, animated_track_data_bit_offset); + +#endif + range_ignore_flags = 0x00; // Don't skip range reduction } } @@ -1131,10 +1312,23 @@ namespace acl if (decomp_context.has_segments && (range_ignore_flags & 0x01) == 0) { // Apply segment range remapping + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f segment_range_center = unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(segment_range_data); + const rtm::vector4f segment_range_extent = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(segment_range_data + 3 * sizeof(uint8_t)); + + sample = rtm::vector_mul_add(sample, segment_range_extent, segment_range_center); + +#else + const rtm::vector4f segment_range_min = unpack_vector3_u24_unsafe(segment_range_data); const rtm::vector4f segment_range_extent = unpack_vector3_u24_unsafe(segment_range_data + 3 * sizeof(uint8_t)); sample = rtm::vector_mul_add(sample, segment_range_extent, segment_range_min); + +#endif + } if ((range_ignore_flags & 0x02) == 0) diff --git a/includes/acl/math/quat_packing.h b/includes/acl/math/quat_packing.h index de1314ff..34cf49a2 100644 --- a/includes/acl/math/quat_packing.h +++ b/includes/acl/math/quat_packing.h @@ -52,6 +52,8 @@ namespace acl return rtm::vector_to_quat(unpack_vector4_128(data_ptr)); } +#ifndef ACL_PRECISION_BOOST + inline void RTM_SIMD_CALL pack_quat_96(rtm::quatf_arg0 rotation, uint8_t* out_rotation_data) { rtm::vector4f rotation_xyz = rtm::quat_to_vector(rtm::quat_ensure_positive_w(rotation)); @@ -89,6 +91,8 @@ namespace acl return rtm::quat_from_positive_w(rotation_xyz); } +#endif + ////////////////////////////////////////////////////////////////////////// constexpr uint32_t get_packed_rotation_size(rotation_format8 format) diff --git a/includes/acl/math/scalar_packing.h b/includes/acl/math/scalar_packing.h index 57e094f7..69fdff8e 100644 --- a/includes/acl/math/scalar_packing.h +++ b/includes/acl/math/scalar_packing.h @@ -36,6 +36,37 @@ ACL_IMPL_FILE_PRAGMA_PUSH namespace acl { + +#ifdef ACL_PRECISION_BOOST + + // We don't support [0.0F, 1.0F]. + // It's imprecise, and (0.5F, 1.0F) is incapable of representing the error term of 24 bit quantization. + // We support [-0.5F, 0.5F] instead. 
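The pack/unpack pair defined just below round-trips losslessly: pack is floor(x * 2^n) + 2^(n-1) clamped to 2^n - 1, and unpack is (q - (2^(n-1) - 0.5)) / 2^n, so decoded values sit half a quantization step inside the endpoints. A freestanding restatement with <cmath> stand-ins for the rtm scalar helpers (pack_sn/unpack_sn are illustrative names, not the library functions):

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

static uint32_t pack_sn(float input, uint32_t num_bits)
{
    const uint32_t num_values = 1u << num_bits;
    const float max_value = float(num_values);
    return std::min(uint32_t(std::floor(input * max_value) + 0.5f * max_value), num_values - 1u);
}

static float unpack_sn(uint32_t input, uint32_t num_bits)
{
    const float max_value = float(1u << num_bits);
    return (float(input) - (0.5f * max_value - 0.5f)) / max_value;
}

int main()
{
    const uint32_t num_bits = 8;
    for (uint32_t q = 0; q < (1u << num_bits); ++q)
        assert(pack_sn(unpack_sn(q, num_bits), num_bits) == q); // lossless round-trip
    return 0;
}

Note that unpack_sn(0) is -0.5 + 1/2^(n+1) and unpack_sn(2^n - 1) is 0.5 - 1/2^(n+1), which is exactly what the unit tests further down assert.
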
+ inline uint32_t pack_scalar_signed_normalized(float input, uint32_t num_bits) + { + ACL_ASSERT(num_bits > 0, "Attempting to pack on too few bits"); + ACL_ASSERT(num_bits < 25, "Attempting to pack on too many bits"); + ACL_ASSERT(input >= -0.5F && input <= 0.5F, "Expected signed normalized input value: %f", input); + const uint32_t num_values = 1 << num_bits; + const float max_value = rtm::scalar_safe_to_float(num_values); + const float mid_value = 0.5F * max_value; + return std::min(static_cast<uint32_t>(rtm::scalar_floor(input * max_value) + mid_value), num_values - 1); + } + + inline float unpack_scalar_signed_normalized(uint32_t input, uint32_t num_bits) + { + ACL_ASSERT(num_bits > 0, "Attempting to unpack from too few bits"); + ACL_ASSERT(num_bits < 25, "Attempting to unpack from too many bits"); + const uint32_t num_values = 1 << num_bits; + ACL_ASSERT(input < num_values, "Input value too large: %u", input); + const float max_value = rtm::scalar_safe_to_float(num_values); + const float inv_max_value = 1.0F / max_value; + const float mid_value = (0.5F * max_value) - 0.5F; + return (input - mid_value) * inv_max_value; + } + +#else + inline uint32_t pack_scalar_unsigned(float input, uint32_t num_bits) { ACL_ASSERT(num_bits < 31, "Attempting to pack on too many bits"); @@ -64,6 +95,8 @@ namespace acl return (unpack_scalar_unsigned(input, num_bits) * 2.0F) - 1.0F; } +#endif + // Assumes the 'vector_data' is in big-endian order and is padded in order to load up to 8 bytes from it inline rtm::scalarf RTM_SIMD_CALL unpack_scalarf_32_unsafe(const uint8_t* vector_data, uint32_t bit_offset) { @@ -107,18 +140,46 @@ } // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 8 bytes from it + +#ifdef ACL_PRECISION_BOOST + + inline rtm::scalarf RTM_SIMD_CALL unpack_scalarf_snXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#else + inline rtm::scalarf RTM_SIMD_CALL unpack_scalarf_uXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#endif + { ACL_ASSERT(num_bits <= 19, "This function does not support reading more than 19 bits per component"); struct PackedTableEntry { explicit constexpr PackedTableEntry(uint8_t num_bits_) + +#ifdef ACL_PRECISION_BOOST + + : max_value(1.0F / float(1 << num_bits_)) + , mid_value((0.5F * float(1 << num_bits_)) - 0.5F) + +#else + : max_value(num_bits_ == 0 ? 
1.0F : (1.0F / float((1 << num_bits_) - 1))) + +#endif + , mask((1 << num_bits_) - 1) {} float max_value; + +#ifdef ACL_PRECISION_BOOST + + float mid_value; + +#endif + uint32_t mask; }; @@ -137,18 +198,40 @@ namespace acl const uint32_t mask = k_packed_constants[num_bits].mask; const __m128 inv_max_value = _mm_load_ps1(&k_packed_constants[num_bits].max_value); +#ifdef ACL_PRECISION_BOOST + + const __m128 mid_value = _mm_load_ps1(&k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); const uint32_t x32 = (vector_u32 >> (bit_shift - (bit_offset % 8))); const __m128 value = _mm_cvtsi32_ss(inv_max_value, x32 & mask); + +#ifdef ACL_PRECISION_BOOST + + return rtm::scalarf{ _mm_mul_ss(_mm_sub_ss(value, mid_value), inv_max_value) }; + +#else + return rtm::scalarf{ _mm_mul_ss(value, inv_max_value) }; + +#endif + #elif defined(RTM_NEON_INTRINSICS) const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const float mid_value = k_packed_constants[num_bits].mid_value; + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -156,18 +239,43 @@ namespace acl const int32_t value_u32 = x32 & mask; const float value_f32 = static_cast(value_u32); + +#ifdef ACL_PRECISION_BOOST + + return (value_f32 - mid_value) * inv_max_value; + +#else + return value_f32 * inv_max_value; + +#endif + #else const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const float mid_value = k_packed_constants[num_bits].mid_value; + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); const uint32_t x32 = (vector_u32 >> (bit_shift - (bit_offset % 8))) & mask; +#ifdef ACL_PRECISION_BOOST + + return rtm::scalar_set((static_cast(x32) - mid_value) * inv_max_value); + +#else + return rtm::scalar_set(static_cast(x32) * inv_max_value); + +#endif + #endif } } diff --git a/includes/acl/math/vector4_packing.h b/includes/acl/math/vector4_packing.h index 93d6f8de..4d332ffa 100644 --- a/includes/acl/math/vector4_packing.h +++ b/includes/acl/math/vector4_packing.h @@ -160,6 +160,8 @@ namespace acl #endif } +#ifndef ACL_PRECISION_BOOST + inline void RTM_SIMD_CALL pack_vector4_64(rtm::vector4f_arg0 vector, bool is_unsigned, uint8_t* out_vector_data) { uint32_t vector_x = is_unsigned ? pack_scalar_unsigned(rtm::vector_get_x(vector), 16) : pack_scalar_signed(rtm::vector_get_x(vector), 16); @@ -214,7 +216,40 @@ namespace acl return rtm::vector_set(x, y, z, w); } +#endif + // Packs data in big-endian order and assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it + +#ifdef ACL_PRECISION_BOOST + + inline void RTM_SIMD_CALL pack_vector4_snXX_unsafe(rtm::vector4f_arg0 vector, uint32_t num_bits, uint8_t* out_vector_data) + { + // It's better to test functions which are used by ACL, than write new functions that are only used by unit tests. + // That's easier said than done in this case, so we emulate ACL here instead. 
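Concretely, the emulation referred to above quantizes each component with the same floor(x * 2^n) + 2^(n-1) rule and then streams the n-bit codes back to back, most significant bit first. A self-contained sketch of that pipeline; write_bits is a hypothetical stand-in for the memcpy_bits/byte_swap machinery, and the output buffer must be zero-initialized by the caller:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Append the num_bits least-significant bits of 'value' to 'out',
// most significant bit first, starting at 'bit_offset'.
static void write_bits(uint8_t* out, uint32_t& bit_offset, uint32_t value, uint32_t num_bits)
{
    for (uint32_t i = num_bits; i-- > 0; ++bit_offset)
        out[bit_offset / 8] |= uint8_t(((value >> i) & 1u) << (7u - (bit_offset % 8u)));
}

// Quantize four signed-normalized floats (each in [-0.5, 0.5]) and pack them.
static void pack4_snXX(const float* input, uint32_t num_bits, uint8_t* out)
{
    const float max_value = float(1u << num_bits);
    uint32_t bit_offset = 0;
    for (int i = 0; i < 4; ++i)
    {
        const float packed = std::min(std::floor(input[i] * max_value) + 0.5f * max_value, max_value - 1.0f);
        write_bits(out, bit_offset, uint32_t(packed), num_bits);
    }
}

int main()
{
    const float v[4] = { -0.5f, -0.1f, 0.2f, 0.5f };
    uint8_t buffer[16] = {};   // zero-initialized: write_bits only ORs bits in
    pack4_snXX(v, 11, buffer); // 44 bits written
    return 0;
}
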
+ + // /acl/compression/impl/quantize_track_impl.h pack_vector4_snXX + ACL_ASSERT(rtm::vector_all_greater_equal(vector, rtm::vector_set(-0.5F)) && rtm::vector_all_less_equal(vector, rtm::vector_set(0.5F)), "Expected normalized signed input value: %f, %f, %f, %f", (float)rtm::vector_get_x(vector), (float)rtm::vector_get_y(vector), (float)rtm::vector_get_z(vector), (float)rtm::vector_get_w(vector)); + const float max_value = rtm::scalar_safe_to_float(1 << num_bits); + const rtm::vector4f packed_vector = rtm::vector_min(rtm::vector_add(rtm::vector_floor(rtm::vector_mul(vector, rtm::vector_set(max_value))), rtm::vector_set(0.5F * max_value)), rtm::vector_set(max_value - 1.0F)); + + // /acl/compression/impl/write_track_data_impl.h write_track_animated_values + const float* sample_f32 = safe_ptr_cast(&packed_vector); + uint64_t output_bit_offset = 0; + for (uint32_t component_index = 0; component_index < 4; ++component_index) + { + uint32_t value; + + value = safe_static_cast(sample_f32[component_index]); + value = value << (32 - num_bits); + value = byte_swap(value); + + memcpy_bits(out_vector_data, output_bit_offset, &value, 0, num_bits); + output_bit_offset += num_bits; + } + } + +#else + inline void RTM_SIMD_CALL pack_vector4_uXX_unsafe(rtm::vector4f_arg0 vector, uint32_t num_bits, uint8_t* out_vector_data) { uint32_t vector_x = pack_scalar_unsigned(rtm::vector_get_x(vector), num_bits); @@ -236,19 +271,49 @@ namespace acl memcpy_bits(out_vector_data, bit_offset, &vector_u32, 0, num_bits); } +#endif + // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 16 bytes from it + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL unpack_vector4_snXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#else + inline rtm::vector4f RTM_SIMD_CALL unpack_vector4_uXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#endif + { ACL_ASSERT(num_bits <= 19, "This function does not support reading more than 19 bits per component"); struct PackedTableEntry { explicit constexpr PackedTableEntry(uint8_t num_bits_) + +#ifdef ACL_PRECISION_BOOST + + : max_value(1.0F / float(1 << num_bits_)) + , mid_value((0.5F * float(1 << num_bits_)) - 0.5F) + +#else + : max_value(num_bits_ == 0 ? 
1.0F : (1.0F / float((1 << num_bits_) - 1))) + +#endif + , mask((1 << num_bits_) - 1) {} float max_value; + +#ifdef ACL_PRECISION_BOOST + + float mid_value; + +#endif + uint32_t mask; }; @@ -267,6 +332,12 @@ namespace acl const __m128i mask = _mm_castps_si128(_mm_load_ps1((const float*)&k_packed_constants[num_bits].mask)); const __m128 inv_max_value = _mm_load_ps1(&k_packed_constants[num_bits].max_value); +#ifdef ACL_PRECISION_BOOST + + const __m128 mid_value = _mm_load_ps1(&k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -296,12 +367,28 @@ namespace acl __m128i int_value = _mm_set_epi32(w32, z32, y32, x32); int_value = _mm_and_si128(int_value, mask); const __m128 value = _mm_cvtepi32_ps(int_value); + +#ifdef ACL_PRECISION_BOOST + + return _mm_mul_ps(_mm_sub_ps(value, mid_value), inv_max_value); + +#else + return _mm_mul_ps(value, inv_max_value); + +#endif + #elif defined(RTM_NEON_INTRINSICS) const uint32_t bit_shift = 32 - num_bits; uint32x4_t mask = vdupq_n_u32(k_packed_constants[num_bits].mask); float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const float32x4_t mid_value = vdupq_n_f32(k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -333,12 +420,28 @@ namespace acl uint32x4_t value_u32 = vcombine_u32(xy, zw); value_u32 = vandq_u32(value_u32, mask); float32x4_t value_f32 = vcvtq_f32_u32(value_u32); + +#ifdef ACL_PRECISION_BOOST + + return vmulq_n_f32(vsubq_f32(value_f32, mid_value), inv_max_value); + +#else + return vmulq_n_f32(value_f32, inv_max_value); + +#endif + #else const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f mid_value = rtm::vector_set(k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -365,7 +468,16 @@ namespace acl vector_u32 = byte_swap(vector_u32); const uint32_t w32 = (vector_u32 >> (bit_shift - (bit_offset % 8))) & mask; +#ifdef ACL_PRECISION_BOOST + + return rtm::vector_mul(rtm::vector_sub(rtm::vector_set(float(x32), float(y32), float(z32), float(w32)), mid_value), inv_max_value); + +#else + return rtm::vector_mul(rtm::vector_set(float(x32), float(y32), float(z32), float(w32)), inv_max_value); + +#endif + #endif } @@ -572,18 +684,34 @@ namespace acl } // Assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it + +#ifdef ACL_PRECISION_BOOST + + inline void RTM_SIMD_CALL pack_vector3_sn48_unsafe_precise_endpoints(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) + { + const rtm::vector4f packed = rtm::vector_add(rtm::vector_floor(rtm::vector_mul(vector, rtm::vector_set(65535.0F))), rtm::vector_set(32768.0F)); + uint32_t vector_x = static_cast(rtm::vector_get_x(packed)); + uint32_t vector_y = static_cast(rtm::vector_get_y(packed)); + uint32_t vector_z = static_cast(rtm::vector_get_z(packed)); + +#else + inline void RTM_SIMD_CALL pack_vector3_u48_unsafe(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) { uint32_t vector_x = pack_scalar_unsigned(rtm::vector_get_x(vector), 16); uint32_t vector_y = 
pack_scalar_unsigned(rtm::vector_get_y(vector), 16); uint32_t vector_z = pack_scalar_unsigned(rtm::vector_get_z(vector), 16); +#endif + uint16_t* data = safe_ptr_cast(out_vector_data); data[0] = safe_static_cast(vector_x); data[1] = safe_static_cast(vector_y); data[2] = safe_static_cast(vector_z); } +#ifndef ACL_PRECISION_BOOST + // Assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it inline void RTM_SIMD_CALL pack_vector3_s48_unsafe(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) { @@ -597,24 +725,65 @@ namespace acl data[2] = safe_static_cast(vector_z); } +#endif + // Assumes the 'vector_data' is padded in order to load up to 16 bytes from it + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_sn48_unsafe_precise_endpoints(const uint8_t* vector_data) + +#else + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_u48_unsafe(const uint8_t* vector_data) + +#endif + { #if defined(RTM_SSE2_INTRINSICS) __m128i zero = _mm_setzero_si128(); __m128i x16y16z16 = _mm_loadu_si128((const __m128i*)vector_data); __m128i x32y32z32 = _mm_unpacklo_epi16(x16y16z16, zero); __m128 value = _mm_cvtepi32_ps(x32y32z32); + + +#ifdef ACL_PRECISION_BOOST + + return _mm_mul_ps(_mm_sub_ps(value, _mm_set_ps1(32767.5F)), _mm_set_ps1(1.0F / 65535.0F)); + +#else + return _mm_mul_ps(value, _mm_set_ps1(1.0F / 65535.0F)); + +#endif + #elif defined(RTM_NEON_INTRINSICS) uint8x8_t x8y8z8 = vld1_u8(vector_data); uint16x4_t x16y16z16 = vreinterpret_u16_u8(x8y8z8); uint32x4_t x32y32z32 = vmovl_u16(x16y16z16); float32x4_t value = vcvtq_f32_u32(x32y32z32); + +#ifdef ACL_PRECISION_BOOST + + return vmulq_n_f32(vsubq_f32(value, vdupq_n_f32(32767.5F)), 1.0F / 65535.0F); + +#else + return vmulq_n_f32(value, 1.0F / 65535.0F); + +#endif + #else const uint16_t* data_ptr_u16 = safe_ptr_cast(vector_data); + +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f packed = rtm::vector_set(static_cast(data_ptr_u16[0]), static_cast(data_ptr_u16[1]), static_cast(data_ptr_u16[2])); + return rtm::vector_mul(rtm::vector_sub(packed, rtm::vector_set(32767.5F)), rtm::vector_set(1.0F / 65535.0F)); + +#else + uint16_t x16 = data_ptr_u16[0]; uint16_t y16 = data_ptr_u16[1]; uint16_t z16 = data_ptr_u16[2]; @@ -622,9 +791,14 @@ namespace acl float y = unpack_scalar_unsigned(y16, 16); float z = unpack_scalar_unsigned(z16, 16); return rtm::vector_set(x, y, z); + +#endif + #endif } +#ifndef ACL_PRECISION_BOOST + // Assumes the 'vector_data' is padded in order to load up to 16 bytes from it inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_s48_unsafe(const uint8_t* vector_data) { @@ -632,6 +806,21 @@ namespace acl return rtm::vector_neg_mul_sub(unsigned_value, -2.0F, rtm::vector_set(-1.0F)); } +#endif + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL decay_vector3_sn48_precise_endpoints(rtm::vector4f_arg0 input) + { + ACL_ASSERT(rtm::vector_all_greater_equal3(input, rtm::vector_set(-0.5F)) && rtm::vector_all_less_equal3(input, rtm::vector_set(0.5F)), "Expected signed normalized unsigned input value: %f, %f, %f", (float)rtm::vector_get_x(input), (float)rtm::vector_get_y(input), (float)rtm::vector_get_z(input)); + + const rtm::vector4f packed = rtm::vector_add(rtm::vector_floor(rtm::vector_mul(input, rtm::vector_set(65535.0F))), rtm::vector_set(32768.0F)); + const rtm::vector4f decayed = rtm::vector_mul(rtm::vector_sub(packed, rtm::vector_set(32767.5F)), rtm::vector_set(1.0F / 65535.0F)); + return decayed; + } + +#else + inline rtm::vector4f RTM_SIMD_CALL 
decay_vector3_u48(rtm::vector4f_arg0 input) { ACL_ASSERT(rtm::vector_all_greater_equal3(input, rtm::vector_zero()) && rtm::vector_all_less_equal3(input, rtm::vector_set(1.0F)), "Expected normalized unsigned input value: %f, %f, %f", (float)rtm::vector_get_x(input), (float)rtm::vector_get_y(input), (float)rtm::vector_get_z(input)); @@ -726,19 +915,45 @@ namespace acl return rtm::vector_set(x, y, z); } +#endif + // Assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it + +#ifdef ACL_PRECISION_BOOST + + inline void RTM_SIMD_CALL pack_vector3_sn24_unsafe_precise_endpoints_midpoint(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) + { + const rtm::vector4f packed = rtm::vector_add(rtm::vector_floor(rtm::vector_mul_add(vector, rtm::vector_set(254.0F), rtm::vector_set(0.5F))), rtm::vector_set(127.0F)); + uint32_t vector_x = static_cast(rtm::vector_get_x(packed)); + uint32_t vector_y = static_cast(rtm::vector_get_y(packed)); + uint32_t vector_z = static_cast(rtm::vector_get_z(packed)); + +#else + inline void RTM_SIMD_CALL pack_vector3_u24_unsafe(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) { uint32_t vector_x = pack_scalar_unsigned(rtm::vector_get_x(vector), 8); uint32_t vector_y = pack_scalar_unsigned(rtm::vector_get_y(vector), 8); uint32_t vector_z = pack_scalar_unsigned(rtm::vector_get_z(vector), 8); +#endif + out_vector_data[0] = safe_static_cast(vector_x); out_vector_data[1] = safe_static_cast(vector_y); out_vector_data[2] = safe_static_cast(vector_z); } // Assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it + +#ifdef ACL_PRECISION_BOOST + + inline void RTM_SIMD_CALL pack_vector3_u24_unsafe_precise_endpoints_midpoint(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) + { + return pack_vector3_sn24_unsafe_precise_endpoints_midpoint(rtm::vector_sub(vector, rtm::vector_set(0.5F)), out_vector_data); + } + +#else + inline void RTM_SIMD_CALL pack_vector3_s24_unsafe(rtm::vector4f_arg0 vector, uint8_t* out_vector_data) { uint32_t vector_x = pack_scalar_signed(rtm::vector_get_x(vector), 8); @@ -750,8 +965,20 @@ namespace acl out_vector_data[2] = safe_static_cast(vector_z); } +#endif + // Assumes the 'vector_data' is padded in order to load up to 16 bytes from it + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_u24_unsafe_precise_endpoints_midpoint(const uint8_t* vector_data) + +#else + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_u24_unsafe(const uint8_t* vector_data) + +#endif + { #if defined(RTM_SSE2_INTRINSICS) && 0 // This implementation leverages fast fixed point coercion, it relies on the @@ -771,39 +998,95 @@ namespace acl __m128i x16y16z16 = _mm_unpacklo_epi8(x8y8z8, zero); __m128i x32y32z32 = _mm_unpacklo_epi16(x16y16z16, zero); __m128 value = _mm_cvtepi32_ps(x32y32z32); + +#ifdef ACL_PRECISION_BOOST + + return _mm_mul_ps(value, _mm_set_ps1(1.0F / 254.0F)); + +#else + return _mm_mul_ps(value, _mm_set_ps1(1.0F / 255.0F)); + +#endif + #elif defined(RTM_NEON_INTRINSICS) uint8x8_t x8y8z8 = vld1_u8(vector_data); uint16x8_t x16y16z16 = vmovl_u8(x8y8z8); uint32x4_t x32y32z32 = vmovl_u16(vget_low_u16(x16y16z16)); float32x4_t value = vcvtq_f32_u32(x32y32z32); + +#ifdef ACL_PRECISION_BOOST + + return vmulq_n_f32(value, 1.0F / 254.0F); + +#else + return vmulq_n_f32(value, 1.0F / 255.0F); + +#endif + #else uint8_t x8 = vector_data[0]; uint8_t y8 = vector_data[1]; uint8_t z8 = vector_data[2]; + +#ifdef ACL_PRECISION_BOOST + + return rtm::vector_mul(rtm::vector_set(static_cast(x8), static_cast(y8), 
static_cast(z8)), 1.0F / 254.0F); + +#else + float x = unpack_scalar_unsigned(x8, 8); float y = unpack_scalar_unsigned(y8, 8); float z = unpack_scalar_unsigned(z8, 8); return rtm::vector_set(x, y, z); + +#endif + #endif } // Assumes the 'vector_data' is padded in order to load up to 16 bytes from it + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(const uint8_t* vector_data) + { + const rtm::vector4f unsigned_value = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(vector_data); + return rtm::vector_sub(unsigned_value, rtm::vector_set(0.5F)); + } + +#else + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_s24_unsafe(const uint8_t* vector_data) { const rtm::vector4f unsigned_value = unpack_vector3_u24_unsafe(vector_data); return rtm::vector_neg_mul_sub(unsigned_value, -2.0F, rtm::vector_set(-1.0F)); } +#endif + // Packs data in big-endian order and assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it + +#ifdef ACL_PRECISION_BOOST + + inline void RTM_SIMD_CALL pack_vector3_snXX_unsafe(rtm::vector4f_arg0 vector, uint32_t num_bits, uint8_t* out_vector_data) + { + uint32_t vector_x = pack_scalar_signed_normalized(rtm::vector_get_x(vector), num_bits); + uint32_t vector_y = pack_scalar_signed_normalized(rtm::vector_get_y(vector), num_bits); + uint32_t vector_z = pack_scalar_signed_normalized(rtm::vector_get_z(vector), num_bits); + +#else + inline void RTM_SIMD_CALL pack_vector3_uXX_unsafe(rtm::vector4f_arg0 vector, uint32_t num_bits, uint8_t* out_vector_data) { uint32_t vector_x = pack_scalar_unsigned(rtm::vector_get_x(vector), num_bits); uint32_t vector_y = pack_scalar_unsigned(rtm::vector_get_y(vector), num_bits); uint32_t vector_z = pack_scalar_unsigned(rtm::vector_get_z(vector), num_bits); +#endif + uint64_t vector_u64 = static_cast(vector_x) << (64 - num_bits * 1); vector_u64 |= static_cast(vector_y) << (64 - num_bits * 2); vector_u64 |= static_cast(vector_z) << (64 - num_bits * 3); @@ -812,6 +1095,8 @@ namespace acl unaligned_write(vector_u64, out_vector_data); } +#ifndef ACL_PRECISION_BOOST + // Packs data in big-endian order and assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it inline void RTM_SIMD_CALL pack_vector3_sXX_unsafe(rtm::vector4f_arg0 vector, uint32_t num_bits, uint8_t* out_vector_data) { @@ -827,6 +1112,27 @@ namespace acl unaligned_write(vector_u64, out_vector_data); } +#endif + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL decay_vector3_snXX(rtm::vector4f_arg0 value, uint32_t num_bits) + { + ACL_ASSERT(rtm::vector_all_greater_equal3(value, rtm::vector_set(-0.5F)) && rtm::vector_all_less_equal3(value, rtm::vector_set(0.5F)), "Expected normalized signed input value: %f, %f, %f", (float)rtm::vector_get_x(value), (float)rtm::vector_get_y(value), (float)rtm::vector_get_z(value)); + + const float max_value = rtm::scalar_safe_to_float(1 << num_bits); + const float inv_max_value = 1.0F / max_value; + const float limit = max_value - 1.0F; + const float mid_compress = 0.5F * max_value; + const float mid_decompress = (0.5F * max_value) - 0.5F; + + const rtm::vector4f packed = rtm::vector_min(rtm::vector_add(rtm::vector_floor(rtm::vector_mul(value, max_value)), rtm::vector_set(mid_compress)), rtm::vector_set(limit)); + const rtm::vector4f decayed = rtm::vector_mul(rtm::vector_sub(packed, rtm::vector_set(mid_decompress)), inv_max_value); + return decayed; + } + +#else + inline rtm::vector4f RTM_SIMD_CALL 
decay_vector3_uXX(rtm::vector4f_arg0 input, uint32_t num_bits) { ACL_ASSERT(rtm::vector_all_greater_equal3(input, rtm::vector_zero()) && rtm::vector_all_less_equal3(input, rtm::vector_set(1.0F)), "Expected normalized unsigned input value: %f, %f, %f", (float)rtm::vector_get_x(input), (float)rtm::vector_get_y(input), (float)rtm::vector_get_z(input)); @@ -854,19 +1160,49 @@ namespace acl return rtm::vector_neg_mul_sub(decayed, -2.0F, rtm::vector_set(-1.0F)); } +#endif + // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 16 bytes from it + +#ifdef ACL_PRECISION_BOOST + + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_snXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#else + inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_uXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#endif + { ACL_ASSERT(num_bits <= 19, "This function does not support reading more than 19 bits per component"); struct PackedTableEntry { explicit constexpr PackedTableEntry(uint8_t num_bits_) + +#ifdef ACL_PRECISION_BOOST + + : max_value(1.0F / float(1 << num_bits_)) + , mid_value((0.5F * float(1 << num_bits_)) - 0.5F) + +#else + : max_value(num_bits_ == 0 ? 1.0F : (1.0F / float((1 << num_bits_) - 1))) + +#endif + , mask((1 << num_bits_) - 1) {} float max_value; + +#ifdef ACL_PRECISION_BOOST + + float mid_value; + +#endif + uint32_t mask; }; @@ -885,6 +1221,12 @@ namespace acl const __m128i mask = _mm_castps_si128(_mm_load_ps1((const float*)&k_packed_constants[num_bits].mask)); const __m128 inv_max_value = _mm_load_ps1(&k_packed_constants[num_bits].max_value); +#ifdef ACL_PRECISION_BOOST + + const __m128 mid_value = _mm_load_ps1(&k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -907,12 +1249,28 @@ namespace acl __m128i int_value = _mm_set_epi32(x32, z32, y32, x32); int_value = _mm_and_si128(int_value, mask); const __m128 value = _mm_cvtepi32_ps(int_value); + +#ifdef ACL_PRECISION_BOOST + + return _mm_mul_ps(_mm_sub_ps(value, mid_value), inv_max_value); + +#else + return _mm_mul_ps(value, inv_max_value); + +#endif + #elif defined(RTM_NEON_INTRINSICS) const uint32_t bit_shift = 32 - num_bits; uint32x4_t mask = vdupq_n_u32(k_packed_constants[num_bits].mask); float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const float32x4_t mid_value = vdupq_n_f32(k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -937,12 +1295,28 @@ namespace acl uint32x4_t value_u32 = vcombine_u32(xy, z); value_u32 = vandq_u32(value_u32, mask); float32x4_t value_f32 = vcvtq_f32_u32(value_u32); + +#ifdef ACL_PRECISION_BOOST + + return vmulq_n_f32(vsubq_f32(value_f32, mid_value), inv_max_value); + +#else + return vmulq_n_f32(value_f32, inv_max_value); + +#endif + #else const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f mid_value = rtm::vector_set(k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -962,10 +1336,21 @@ 
namespace acl vector_u32 = byte_swap(vector_u32); const uint32_t z32 = (vector_u32 >> (bit_shift - (bit_offset % 8))) & mask; +#ifdef ACL_PRECISION_BOOST + + return rtm::vector_mul(rtm::vector_sub(rtm::vector_set(float(x32), float(y32), float(z32)), mid_value), inv_max_value); + +#else + return rtm::vector_mul(rtm::vector_set(float(x32), float(y32), float(z32)), inv_max_value); + +#endif + #endif } +#ifndef ACL_PRECISION_BOOST + // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 16 bytes from it inline rtm::vector4f RTM_SIMD_CALL unpack_vector3_sXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) { @@ -975,9 +1360,18 @@ namespace acl return rtm::vector_neg_mul_sub(unsigned_value, -2.0F, rtm::vector_set(-1.0F)); } +#endif + ////////////////////////////////////////////////////////////////////////// // vector2 packing and decay +#ifdef ACL_PRECISION_BOOST + + // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 16 bytes from it + inline rtm::vector4f RTM_SIMD_CALL unpack_vector2_snXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#else + // Packs data in big-endian order and assumes the 'out_vector_data' is padded in order to write up to 16 bytes to it inline void RTM_SIMD_CALL pack_vector2_uXX_unsafe(rtm::vector4f_arg0 vector, uint32_t num_bits, uint8_t* out_vector_data) { @@ -993,17 +1387,38 @@ namespace acl // Assumes the 'vector_data' is in big-endian order and padded in order to load up to 16 bytes from it inline rtm::vector4f RTM_SIMD_CALL unpack_vector2_uXX_unsafe(uint32_t num_bits, const uint8_t* vector_data, uint32_t bit_offset) + +#endif + { ACL_ASSERT(num_bits <= 19, "This function does not support reading more than 19 bits per component"); struct PackedTableEntry { explicit constexpr PackedTableEntry(uint8_t num_bits_) + +#ifdef ACL_PRECISION_BOOST + + : max_value(1.0F / float(1 << num_bits_)) + , mid_value((0.5F * float(1 << num_bits_)) - 0.5F) + +#else + : max_value(num_bits_ == 0 ? 
1.0F : (1.0F / float((1 << num_bits_) - 1))) + +#endif + , mask((1 << num_bits_) - 1) {} float max_value; + +#ifdef ACL_PRECISION_BOOST + + float mid_value; + +#endif + uint32_t mask; }; @@ -1022,6 +1437,12 @@ namespace acl const __m128i mask = _mm_castps_si128(_mm_load_ps1((const float*)&k_packed_constants[num_bits].mask)); const __m128 inv_max_value = _mm_load_ps1(&k_packed_constants[num_bits].max_value); +#ifdef ACL_PRECISION_BOOST + + const __m128 mid_value = _mm_load_ps1(&k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -1037,12 +1458,28 @@ namespace acl __m128i int_value = _mm_set_epi32(y32, x32, y32, x32); int_value = _mm_and_si128(int_value, mask); const __m128 value = _mm_cvtepi32_ps(int_value); + +#ifdef ACL_PRECISION_BOOST + + return _mm_mul_ps(_mm_sub_ps(value, mid_value), inv_max_value); + +#else + return _mm_mul_ps(value, inv_max_value); + +#endif + #elif defined(RTM_NEON_INTRINSICS) const uint32_t bit_shift = 32 - num_bits; uint32x2_t mask = vdup_n_u32(k_packed_constants[num_bits].mask); float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const float32x2_t mid_value = vdup_n_f32(k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -1058,13 +1495,29 @@ namespace acl uint32x2_t xy = vcreate_u32(uint64_t(x32) | (uint64_t(y32) << 32)); xy = vand_u32(xy, mask); float32x2_t value_f32 = vcvt_f32_u32(xy); + +#ifdef ACL_PRECISION_BOOST + + float32x2_t result = vmul_n_f32(vsub_f32(value_f32, mid_value), inv_max_value); + +#else + float32x2_t result = vmul_n_f32(value_f32, inv_max_value); + +#endif + return vcombine_f32(result, result); #else const uint32_t bit_shift = 32 - num_bits; const uint32_t mask = k_packed_constants[num_bits].mask; const float inv_max_value = k_packed_constants[num_bits].max_value; +#ifdef ACL_PRECISION_BOOST + + const rtm::vector4f mid_value = rtm::vector_set(k_packed_constants[num_bits].mid_value); + +#endif + uint32_t byte_offset = bit_offset / 8; uint32_t vector_u32 = unaligned_load(vector_data + byte_offset); vector_u32 = byte_swap(vector_u32); @@ -1077,7 +1530,16 @@ namespace acl vector_u32 = byte_swap(vector_u32); const uint32_t y32 = (vector_u32 >> (bit_shift - (bit_offset % 8))) & mask; +#ifdef ACL_PRECISION_BOOST + + return rtm::vector_mul(rtm::vector_sub(rtm::vector_set(float(x32), float(y32), 0.0F, 0.0F), mid_value), inv_max_value); + +#else + return rtm::vector_mul(rtm::vector_set(float(x32), float(y32), 0.0F, 0.0F), inv_max_value); + +#endif + #endif } diff --git a/make.py b/make.py index 00c9974c..2ebb1be0 100644 --- a/make.py +++ b/make.py @@ -15,7 +15,7 @@ import zipfile # The current test/decompression data version in use -current_test_data = 'test_data_v5' +current_test_data = 'test_data_v4' # ACL_PRECISION_BOOST test_data_v5 current_decomp_data = 'decomp_data_v7' def parse_argv(): @@ -516,7 +516,7 @@ def do_prepare_regression_test_data(test_data_dir, args): regression_clips = [] for (dirpath, dirnames, filenames) in os.walk(current_test_data_dir): for filename in filenames: - if not filename.endswith('.acl'): + if not filename.endswith('.acl.sjson'): # ACL_PRECISION_BOOST .acl continue clip_filename = os.path.join(dirpath, filename) @@ -589,7 +589,7 @@ def do_prepare_decompression_test_data(test_data_dir, args): 
clips = [] for (dirpath, dirnames, filenames) in os.walk(current_data_dir): for filename in filenames: - if not filename.endswith('.acl'): + if not filename.endswith('.acl.sjson'): # ACL_PRECISION_BOOST .acl continue clip_filename = os.path.join(dirpath, filename) @@ -666,7 +666,7 @@ def do_regression_tests_cmake(test_data_dir, args): current_test_data_dir = os.path.join(test_data_dir, current_test_data) for (dirpath, dirnames, filenames) in os.walk(current_test_data_dir): for filename in filenames: - if not filename.endswith('.acl'): + if not filename.endswith('.acl.sjson'): # ACL_PRECISION_BOOST .acl continue clip_filename = os.path.join(dirpath, filename) diff --git a/tests/sources/math/test_quat_packing.cpp b/tests/sources/math/test_quat_packing.cpp index 9ceaa4b9..de60a61b 100644 --- a/tests/sources/math/test_quat_packing.cpp +++ b/tests/sources/math/test_quat_packing.cpp @@ -51,6 +51,8 @@ TEST_CASE("quat packing math", "[math][quat][packing]") CHECK((float)quat_get_w(quat0) == (float)quat_get_w(quat1)); } +#ifndef ACL_PRECISION_BOOST + { UnalignedBuffer tmp0; pack_quat_96(quat0, &tmp0.buffer[0]); @@ -81,6 +83,8 @@ TEST_CASE("quat packing math", "[math][quat][packing]") CHECK(scalar_near_equal((float)quat_get_w(quat0), (float)quat_get_w(quat1), 1.0E-3F)); } +#endif + CHECK(get_packed_rotation_size(rotation_format8::quatf_full) == 16); CHECK(get_packed_rotation_size(rotation_format8::quatf_drop_w_full) == 12); diff --git a/tests/sources/math/test_scalar_packing.cpp b/tests/sources/math/test_scalar_packing.cpp index 032326e9..d7e273db 100644 --- a/tests/sources/math/test_scalar_packing.cpp +++ b/tests/sources/math/test_scalar_packing.cpp @@ -41,13 +41,39 @@ static_assert((offsetof(UnalignedBuffer, buffer) % 2) == 0, "Minimum packing ali TEST_CASE("scalar packing math", "[math][scalar][packing]") { + +#ifdef ACL_PRECISION_BOOST + + for (uint8_t num_bits = 1; num_bits <= 24; ++num_bits) + { + INFO("num_bits: " << int(num_bits)); + + const float error = 1.0F / (1 << (num_bits + 1)); + INFO("error: " << error); + +#else + const float threshold = 1.0E-6F; const uint8_t max_num_bits = 23; for (uint8_t num_bits = 1; num_bits < max_num_bits; ++num_bits) { + +#endif + const uint32_t max_value = (1 << num_bits) - 1; +#ifdef ACL_PRECISION_BOOST + + CHECK(pack_scalar_signed_normalized(-0.5F, num_bits) == 0); + CHECK(pack_scalar_signed_normalized(0.5F, num_bits) == max_value); + CHECK(-0.5F < -0.5F + error); + CHECK(unpack_scalar_signed_normalized(0, num_bits) == -0.5F + error); + CHECK(0.5F > 0.5F - error); + CHECK(unpack_scalar_signed_normalized(max_value, num_bits) == 0.5F - error); + +#else + CHECK(pack_scalar_unsigned(0.0F, num_bits) == 0); CHECK(pack_scalar_unsigned(1.0F, num_bits) == max_value); CHECK(unpack_scalar_unsigned(0, num_bits) == 0.0F); @@ -58,7 +84,55 @@ TEST_CASE("scalar packing math", "[math][scalar][packing]") CHECK(unpack_scalar_signed(0, num_bits) == -1.0F); CHECK(rtm::scalar_near_equal(unpack_scalar_signed(max_value, num_bits), 1.0F, threshold)); +#endif + uint32_t num_errors = 0; + +#ifdef ACL_PRECISION_BOOST + + float prev = -1.0f; + for (uint32_t value = 0; value <= max_value; ++value) + { + const float unpacked = unpack_scalar_signed_normalized(value, num_bits); + const uint32_t packed = pack_scalar_signed_normalized(unpacked, num_bits); + if (packed != value || unpacked < -0.5F || unpacked > 0.5F) + num_errors++; + + CHECK(prev < unpacked); + if (value > 0) + { + CHECK(unpacked == prev + (2.0F * error)); + } + prev = unpacked; + + CHECK(unpacked < unpacked + error); + 
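The std::nextafter probes below check the cell geometry: each decoded value is the center of a half-open quantization cell of width 2 * error, so anything in [unpacked - error, unpacked + error) must pack back to the same code; unpacked + error itself is the first value of the next cell and has to be nudged down. The same property restated outside the test harness, with pack/unpack inlined as plain-C++ stand-ins:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

int main()
{
    const uint32_t num_bits = 8;
    const float max_value = float(1u << num_bits);
    const float error = 1.0f / float(1u << (num_bits + 1));
    const auto pack = [&](float v) {
        return std::min(uint32_t(std::floor(v * max_value) + 0.5f * max_value), (1u << num_bits) - 1u);
    };
    for (uint32_t q = 0; q < (1u << num_bits); ++q)
    {
        const float x = (float(q) - (0.5f * max_value - 0.5f)) / max_value; // unpack
        assert(pack(x) == q);
        assert(pack(x - error) == q);                        // lower edge is inside the cell
        assert(pack(std::nextafter(x + error, -1.0f)) == q); // upper edge belongs to the next cell
    }
    return 0;
}

With that picture, the two pack_scalar_signed_normalized calls that follow are probing the two edges of each cell.
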
uint32_t result = pack_scalar_signed_normalized(std::nextafter(unpacked + error, -1.0F), num_bits); + if (result != value) + { + ++num_errors; + //INFO("value: " << int(value)); + //INFO("result: " << int(result)); + //INFO("signedNormalized: " << unpacked0); + //INFO("signedNormalizedError: " << unpacked0 + error); + //INFO("signedNormalizedErrorNext: " << std::nextafter(unpacked0 + error, -1.0F)); + //REQUIRE(false); + } + + CHECK(unpacked > unpacked - error); + result = pack_scalar_signed_normalized(unpacked - error, num_bits); + if (result != value) + { + ++num_errors; + //INFO("value: " << int(value)); + //INFO("result: " << int(result)); + //INFO("signedNormalized: " << unpacked0); + //INFO("signedNormalizedError: " << unpacked0 - error); + //REQUIRE(false); + } + } + +#else + for (uint32_t value = 0; value < max_value; ++value) { const float unpacked0 = unpack_scalar_unsigned(value, num_bits); @@ -71,11 +145,23 @@ TEST_CASE("scalar packing math", "[math][scalar][packing]") if (packed1 != value || unpacked1 < -1.0F || unpacked1 > 1.0F) num_errors++; } + +#endif + CHECK(num_errors == 0); } } +#ifdef ACL_PRECISION_BOOST + +TEST_CASE("unpack_scalarf_32_unsafe", "[math][scalar][packing]") + +#else + TEST_CASE("unpack_scalarf_96_unsafe", "[math][scalar][packing]") + +#endif + { { UnalignedBuffer tmp0; @@ -99,35 +185,120 @@ TEST_CASE("unpack_scalarf_96_unsafe", "[math][scalar][packing]") memcpy_bits(&tmp1.buffer[0], offset, &tmp0.buffer[0], 0, 32); scalarf scalar1 = unpack_scalarf_32_unsafe(&tmp1.buffer[0], offset); + +#ifdef ACL_PRECISION_BOOST + + if (!scalar_equal(vector_get_x(vec0), scalar_cast(scalar1))) + +#else + if (!scalar_near_equal(vector_get_x(vec0), scalar_cast(scalar1), 1.0E-6F)) + +#endif + num_errors++; memcpy_bits(&tmp1.buffer[0], offset, &tmp0.buffer[4], 0, 32); scalar1 = unpack_scalarf_32_unsafe(&tmp1.buffer[0], offset); + +#ifdef ACL_PRECISION_BOOST + + if (!scalar_equal(vector_get_y(vec0), scalar_cast(scalar1))) + +#else + if (!scalar_near_equal(vector_get_y(vec0), scalar_cast(scalar1), 1.0E-6F)) + +#endif + num_errors++; } CHECK(num_errors == 0); } } +#ifdef ACL_PRECISION_BOOST + +TEST_CASE("unpack_scalarf_snXX_unsafe", "[math][scalar][packing]") + +#else + TEST_CASE("unpack_scalarf_uXX_unsafe", "[math][scalar][packing]") + +#endif + { { UnalignedBuffer tmp0; alignas(16) uint8_t buffer[64]; uint32_t num_errors = 0; + +#ifdef ACL_PRECISION_BOOST + + vector4f vec0; + scalarf scalar1; + scalarf scalar2; + +#else + vector4f vec0 = vector_set(unpack_scalar_unsigned(0, 16), unpack_scalar_unsigned(12355, 16), unpack_scalar_unsigned(43222, 16), unpack_scalar_unsigned(54432, 16)); pack_vector2_uXX_unsafe(vec0, 16, &buffer[0]); scalarf scalar1 = unpack_scalarf_uXX_unsafe(16, &buffer[0], 0); if (!scalar_near_equal(vector_get_x(vec0), scalar_cast(scalar1), 1.0E-6F)) num_errors++; +#endif + for (uint8_t bit_rate = 1; bit_rate < k_highest_bit_rate; ++bit_rate) { uint32_t num_bits = get_num_bits_at_bit_rate(bit_rate); uint32_t max_value = (1 << num_bits) - 1; + +#ifdef ACL_PRECISION_BOOST + + INFO("num_bits: " << int(num_bits)); + const float error = 1.0F / (1 << (num_bits + 1)); + INFO("error: " << error); + + vec0 = vector_set(-0.5F, 0.5F, -0.5f + error, 0.5F - error); + pack_vector4_snXX_unsafe(vec0, num_bits, &buffer[0]); + scalar1 = unpack_scalarf_snXX_unsafe(num_bits, &buffer[0], 0); + if (!scalar_equal(vector_get_z(vec0), scalar1)) + { + ++num_errors; + } + + scalar2 = unpack_scalarf_snXX_unsafe(num_bits, &buffer[0], num_bits); + if (!scalar_equal(vector_get_w(vec0), 
scalar2)) + { + ++num_errors; + } + + for (uint32_t value = 0; value <= max_value; ++value) + { + vec0 = vector_set(unpack_scalar_signed_normalized(value, num_bits), 0.0F, 0.0F, 0.0F); + pack_vector4_snXX_unsafe(vec0, num_bits, &buffer[0]); + scalar1 = unpack_scalarf_snXX_unsafe(num_bits, &buffer[0], 0); + + pack_vector4_snXX_unsafe(vector_set(std::nextafter(vector_get_x(vec0) + error, -1.0F)), num_bits, &buffer[0]); + scalar2 = unpack_scalarf_snXX_unsafe(num_bits, &buffer[0], 0); + if (!scalar_equal(scalar1, scalar2)) + { + ++num_errors; + } + + pack_vector4_snXX_unsafe(vector_set(vector_get_x(vec0) - error), num_bits, &buffer[0]); + scalar2 = unpack_scalarf_snXX_unsafe(num_bits, &buffer[0], 0); + if (!scalar_equal(scalar1, scalar2)) + { + ++num_errors; + } + + if (!scalar_equal(vector_get_x(vec0), scalar1)) + +#else + for (uint32_t value = 0; value <= max_value; ++value) { const float value_unsigned = scalar_clamp(unpack_scalar_unsigned(value, num_bits), 0.0F, 1.0F); @@ -136,6 +307,9 @@ TEST_CASE("unpack_scalarf_uXX_unsafe", "[math][scalar][packing]") pack_vector2_uXX_unsafe(vec0, num_bits, &buffer[0]); scalar1 = unpack_scalarf_uXX_unsafe(num_bits, &buffer[0], 0); if (!scalar_near_equal(vector_get_x(vec0), scalar_cast(scalar1), 1.0E-6F)) + +#endif + num_errors++; { @@ -145,8 +319,19 @@ TEST_CASE("unpack_scalarf_uXX_unsafe", "[math][scalar][packing]") const uint8_t offset = offsets[offset_idx]; memcpy_bits(&tmp0.buffer[0], offset, &buffer[0], 0, size_t(num_bits) * 4); + +#ifdef ACL_PRECISION_BOOST + + scalar1 = unpack_scalarf_snXX_unsafe(num_bits, &tmp0.buffer[0], offset); + if (!scalar_equal(vector_get_x(vec0), scalar1)) + +#else + scalar1 = unpack_scalarf_uXX_unsafe(num_bits, &tmp0.buffer[0], offset); if (!scalar_near_equal(vector_get_x(vec0), scalar_cast(scalar1), 1.0E-6F)) + +#endif + num_errors++; } } diff --git a/tests/sources/math/test_vector4_packing.cpp b/tests/sources/math/test_vector4_packing.cpp index 38f80c2b..97400f4a 100644 --- a/tests/sources/math/test_vector4_packing.cpp +++ b/tests/sources/math/test_vector4_packing.cpp @@ -82,13 +82,25 @@ TEST_CASE("pack_vector4_128", "[math][vector4][packing]") memcpy_bits(&tmp1.buffer[0], offset, &tmp0.buffer[0], 0, 128); vector4f vec1 = unpack_vector4_128_unsafe(&tmp1.buffer[0], offset); + +#ifdef ACL_PRECISION_BOOST + + if (!vector_all_near_equal(vec0, vec1, 0.0F)) + +#else + if (!vector_all_near_equal(vec0, vec1, 1.0E-6F)) + +#endif + num_errors++; } CHECK(num_errors == 0); } } +#ifndef ACL_PRECISION_BOOST + TEST_CASE("pack_vector4_64", "[math][vector4][packing]") { { @@ -141,6 +153,8 @@ TEST_CASE("pack_vector4_32", "[math][vector4][packing]") } } +#endif + TEST_CASE("pack_vector4_XX", "[math][vector4][packing]") { { @@ -148,16 +162,74 @@ TEST_CASE("pack_vector4_XX", "[math][vector4][packing]") alignas(16) uint8_t buffer[64]; uint32_t num_errors = 0; + +#ifdef ACL_PRECISION_BOOST + + vector4f vec0; + vector4f vec1; + vector4f vec2; + +#else + vector4f vec0 = vector_set(unpack_scalar_unsigned(0, 16), unpack_scalar_unsigned(12355, 16), unpack_scalar_unsigned(43222, 16), unpack_scalar_unsigned(54432, 16)); pack_vector4_uXX_unsafe(vec0, 16, &buffer[0]); vector4f vec1 = unpack_vector4_uXX_unsafe(16, &buffer[0], 0); if (!vector_all_near_equal(vec0, vec1, 1.0E-6F)) num_errors++; +#endif + for (uint8_t bit_rate = 1; bit_rate < k_highest_bit_rate; ++bit_rate) { uint32_t num_bits = get_num_bits_at_bit_rate(bit_rate); uint32_t max_value = (1 << num_bits) - 1; + +#ifdef ACL_PRECISION_BOOST + + INFO("num_bits: " << int(num_bits)); + const float 
+			const float error = 1.0F / (1 << (num_bits + 1));
+			INFO("error: " << error);
+
+			vec0 = vector_set(-0.5F, 0.5F, -0.5F, 0.5F);
+			vec2 = vector_set(-0.5F + error, 0.5F - error, -0.5F + error, 0.5F - error);
+			pack_vector4_snXX_unsafe(vec0, num_bits, &buffer[0]);
+			vec1 = unpack_vector4_snXX_unsafe(num_bits, &buffer[0], 0);
+			if (!vector_all_near_equal(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			for (uint32_t value = 0; value <= max_value; value += 4)
+			{
+				vec0 = vector_set(
+					unpack_scalar_signed_normalized(value, num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 1, max_value), num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 2, max_value), num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 3, max_value), num_bits));
+				pack_vector4_snXX_unsafe(vec0, num_bits, &buffer[0]);
+				vec1 = unpack_vector4_snXX_unsafe(num_bits, &buffer[0], 0);
+				pack_vector4_snXX_unsafe(vector_set(
+					std::nextafter(vector_get_x(vec0) + error, -1.0F),
+					std::nextafter(vector_get_y(vec0) + error, -1.0F),
+					std::nextafter(vector_get_z(vec0) + error, -1.0F),
+					std::nextafter(vector_get_w(vec0) + error, -1.0F)), num_bits, &buffer[0]);
+				vec2 = unpack_vector4_snXX_unsafe(num_bits, &buffer[0], 0);
+				if (!vector_all_near_equal(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				pack_vector4_snXX_unsafe(vector_sub(vec0, vector_set(error)), num_bits, &buffer[0]);
+				vec2 = unpack_vector4_snXX_unsafe(num_bits, &buffer[0], 0);
+				if (!vector_all_near_equal(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				if (!vector_all_near_equal(vec0, vec1, 0.0F))
+
+#else
+
 			for (uint32_t value = 0; value <= max_value; ++value)
 			{
 				const float value_unsigned = scalar_clamp(unpack_scalar_unsigned(value, num_bits), 0.0F, 1.0F);
@@ -166,6 +238,9 @@ TEST_CASE("pack_vector4_XX", "[math][vector4][packing]")
 				pack_vector4_uXX_unsafe(vec0, num_bits, &buffer[0]);
 				vec1 = unpack_vector4_uXX_unsafe(num_bits, &buffer[0], 0);
 				if (!vector_all_near_equal(vec0, vec1, 1.0E-6F))
+
+#endif
+
 					num_errors++;

 				{
@@ -175,8 +250,19 @@ TEST_CASE("pack_vector4_XX", "[math][vector4][packing]")
 					const uint8_t offset = offsets[offset_idx];
 					memcpy_bits(&tmp0.buffer[0], offset, &buffer[0], 0, size_t(num_bits) * 4);
+
+#ifdef ACL_PRECISION_BOOST
+
+					vec1 = unpack_vector4_snXX_unsafe(num_bits, &tmp0.buffer[0], offset);
+					if (!vector_all_near_equal(vec0, vec1, 0.0F))
+
+#else
+
 					vec1 = unpack_vector4_uXX_unsafe(num_bits, &tmp0.buffer[0], offset);
 					if (!vector_all_near_equal(vec0, vec1, 1.0E-6F))
+
+#endif
+
 						num_errors++;
 				}
 			}
@@ -228,6 +314,50 @@ TEST_CASE("pack_vector3_48", "[math][vector4][packing]")
 	{
 		UnalignedBuffer tmp0;
 		uint32_t num_errors = 0;
+
+#ifdef ACL_PRECISION_BOOST
+
+		vector4f vec0;
+		vector4f vec1;
+		vector4f vec2;
+		const vector4f vec_max_error = vector_set(1.0F - std::nextafter(65534.5F / 65535.0F, 1.0F));
+		const float error = std::min(1.0E-6F, 1.0F / (1 << 17));
+
+		vec0 = vector_set(-0.5F, 0.5F, -0.5F, 0.0F);
+		vec2 = vector_set(-0.5F, 0.5F, -0.5F, 0.0F);
+		pack_vector3_sn48_unsafe_precise_endpoints(vec0, &tmp0.buffer[0]);
+		vec1 = unpack_vector3_sn48_unsafe_precise_endpoints(&tmp0.buffer[0]);
+		if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+		{
+			++num_errors;
+		}
+
+		for (uint32_t value = 0; value < 65536; value += 3)
+		{
+			vec0 = vector_min(vector_add(vector_set(-0.5F), vector_set(
+				float(value) / 65535,
+				float(value + 1) / 65535,
+				float(value + 2) / 65535)), vector_set(0.5F));
+			pack_vector3_sn48_unsafe_precise_endpoints(vec0, &tmp0.buffer[0]);
+			vec1 = unpack_vector3_sn48_unsafe_precise_endpoints(&tmp0.buffer[0]);
+
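+			// vec_max_error is just under half a 1/65535 quantum; probing that far on
+			// either side of vec0 should still decode to the same value as the
+			// unperturbed round-trip above.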
+			pack_vector3_sn48_unsafe_precise_endpoints(vector_max(vector_sub(vec0, vec_max_error), vector_set(-0.5F)), &tmp0.buffer[0]);
+			vec2 = unpack_vector3_sn48_unsafe_precise_endpoints(&tmp0.buffer[0]);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			pack_vector3_sn48_unsafe_precise_endpoints(vector_min(vector_add(vec0, vec_max_error), vector_set(0.5F)), &tmp0.buffer[0]);
+			vec2 = unpack_vector3_sn48_unsafe_precise_endpoints(&tmp0.buffer[0]);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			if (!vector_all_near_equal3(vec0, vec1, error))
+
+#else
+
 		for (uint32_t value = 0; value < 65536; ++value)
 		{
 			const float value_signed = unpack_scalar_signed(value, 16);
@@ -243,6 +373,9 @@ TEST_CASE("pack_vector3_48", "[math][vector4][packing]")
 			pack_vector3_u48_unsafe(vec0, &tmp0.buffer[0]);
 			vec1 = unpack_vector3_u48_unsafe(&tmp0.buffer[0]);
 			if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
+
+#endif
+
 				num_errors++;
 		}

 		CHECK(num_errors == 0);
@@ -253,6 +386,55 @@ TEST_CASE("decay_vector3_48", "[math][vector4][decay]")
 {
 	{
 		uint32_t num_errors = 0;
+
+#ifdef ACL_PRECISION_BOOST
+
+		vector4f vec0;
+		vector4f vec1;
+		vector4f vec2;
+		const vector4f vec_max_error = vector_set(1.0F - std::nextafter(65534.5F / 65535.0F, 1.0F));
+		const float error = std::min(1.0E-6F, 1.0F / (1 << 17));
+
+		for (uint32_t value = 0; value < 65536; value += 3)
+		{
+			vec0 = vector_min(vector_add(vector_set(-0.5F), vector_set(
+				float(value) / 65535,
+				float(value + 1) / 65535,
+				float(value + 2) / 65535)), vector_set(0.5F));
+
+			vec1 = decay_vector3_sn48_precise_endpoints(vec0);
+			vec2 = decay_vector3_sn48_precise_endpoints(vector_max(vector_sub(vec0, vec_max_error), vector_set(-0.5F)));
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			vec2 = decay_vector3_sn48_precise_endpoints(vector_min(vector_add(vec0, vec_max_error), vector_set(0.5F)));
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			if (!(vector_all_near_equal3(vec0, vec1, error) && vector_all_greater_equal3(vec1, vector_set(-0.5F)) && vector_all_less_equal3(vec1, vector_set(0.5F))))
+			{
+				++num_errors;
+			}
+		}
+
+#else
+
 		for (uint32_t value = 0; value < 65536; ++value)
 		{
 			const float value_signed = unpack_scalar_signed(value, 16);
@@ -268,10 +450,15 @@ TEST_CASE("decay_vector3_48", "[math][vector4][decay]")
 			if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
 				num_errors++;
 		}
+
+#endif
+
 		CHECK(num_errors == 0);
 	}
 }

+#ifndef ACL_PRECISION_BOOST
+
 TEST_CASE("pack_vector3_32", "[math][vector4][packing]")
 {
 	{
@@ -337,11 +524,93 @@ TEST_CASE("decay_vector3_32", "[math][vector4][decay]")
 	}
 }

+#endif
+
 TEST_CASE("pack_vector3_24", "[math][vector4][packing]")
 {
 	{
 		UnalignedBuffer tmp0;
 		uint32_t num_errors = 0;
+
+#ifdef ACL_PRECISION_BOOST
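+
+		// The sn24/u24 precise_endpoints_midpoint packers are checked below to
+		// reproduce the range endpoints and the exact midpoint with zero tolerance.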
+
+		vector4f vec0;
+		vector4f vec1;
+		vector4f vec2;
+		const vector4f vec_max_error = vector_set(1.0F - std::nextafter(253.5F / 254.0F, 1.0F));
+		const float error = std::min(1.0E-6F, 1.0F / (1 << 9));
+
+		vec0 = vector_set(-0.5F, 0.0F, 0.5F, 0.0F);
+		vec2 = vector_set(-0.5F, 0.0F, 0.5F, 0.0F);
+		pack_vector3_sn24_unsafe_precise_endpoints_midpoint(vec0, &tmp0.buffer[0]);
+		vec1 = unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+		if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+		{
+			++num_errors;
+		}
+
+		vec0 = vector_set(0.0F, 0.5F, 1.0F, 0.0F);
+		vec2 = vector_set(0.0F, 0.5F, 1.0F, 0.0F);
+		pack_vector3_u24_unsafe_precise_endpoints_midpoint(vec0, &tmp0.buffer[0]);
+		vec1 = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+		if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+		{
+			++num_errors;
+		}
+
+		for (uint32_t value = 0; value < 255; value += 3)
+		{
+			vec0 = vector_min(vector_add(vector_set(-0.5F), vector_set(
+				float(value) / 254,
+				float(value + 1) / 254,
+				float(value + 2) / 254)), vector_set(0.5F));
+			pack_vector3_sn24_unsafe_precise_endpoints_midpoint(vec0, &tmp0.buffer[0]);
+			vec1 = unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+
+			pack_vector3_sn24_unsafe_precise_endpoints_midpoint(vector_max(vector_sub(vec0, vec_max_error), vector_set(-0.5F)), &tmp0.buffer[0]);
+			vec2 = unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			pack_vector3_sn24_unsafe_precise_endpoints_midpoint(vector_min(vector_add(vec0, vec_max_error), vector_set(0.5F)), &tmp0.buffer[0]);
+			vec2 = unpack_vector3_sn24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			if (!vector_all_near_equal3(vec0, vec1, error))
+			{
+				++num_errors;
+			}
+
+			vec0 = vector_min(vector_set(
+				float(value) / 254,
+				float(value + 1) / 254,
+				float(value + 2) / 254), vector_set(1.0F));
+			pack_vector3_u24_unsafe_precise_endpoints_midpoint(vec0, &tmp0.buffer[0]);
+			vec1 = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+
+			pack_vector3_u24_unsafe_precise_endpoints_midpoint(vector_max(vector_sub(vec0, vec_max_error), vector_set(0.0F)), &tmp0.buffer[0]);
+			vec2 = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			pack_vector3_u24_unsafe_precise_endpoints_midpoint(vector_min(vector_add(vec0, vec_max_error), vector_set(1.0F)), &tmp0.buffer[0]);
+			vec2 = unpack_vector3_u24_unsafe_precise_endpoints_midpoint(&tmp0.buffer[0]);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			if (!vector_all_near_equal3(vec0, vec1, error))
+
+#else
+
 		for (uint32_t value = 0; value < 256; ++value)
 		{
 			const float value_signed = scalar_min(unpack_scalar_signed(value, 8), 1.0F);
@@ -357,6 +626,9 @@ TEST_CASE("pack_vector3_24", "[math][vector4][packing]")
 			pack_vector3_u24_unsafe(vec0, &tmp0.buffer[0]);
 			vec1 = unpack_vector3_u24_unsafe(&tmp0.buffer[0]);
 			if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
+
+#endif
+
 				num_errors++;
 		}

 		CHECK(num_errors == 0);
@@ -370,6 +642,15 @@ TEST_CASE("pack_vector3_XX", "[math][vector4][packing]")
 		alignas(16) uint8_t buffer[64];

 		uint32_t num_errors = 0;
+
+#ifdef ACL_PRECISION_BOOST
+
+		vector4f vec0;
+		vector4f vec1;
+		vector4f vec2;
+
+#else
+
 		vector4f vec0 = vector_set(unpack_scalar_signed(0, 16), unpack_scalar_signed(12355, 16), unpack_scalar_signed(43222, 16));
 		pack_vector3_sXX_unsafe(vec0, 16, &buffer[0]);
 		vector4f vec1 = unpack_vector3_sXX_unsafe(16, &buffer[0], 0);
@@ -382,10 +663,59 @@ TEST_CASE("pack_vector3_XX", "[math][vector4][packing]")
 		if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
 			num_errors++;

+#endif
+
 		for (uint8_t bit_rate = 1; bit_rate < k_highest_bit_rate; ++bit_rate)
 		{
 			uint32_t num_bits = get_num_bits_at_bit_rate(bit_rate);
 			uint32_t max_value = (1 << num_bits) - 1;
+
+#ifdef ACL_PRECISION_BOOST
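+
+			// Same probing strategy as pack_vector4_XX above: an exact endpoint check,
+			// then half-step perturbations around every representable 3-lane value.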
+
+			INFO("num_bits: " << int(num_bits));
+			const float error = 1.0F / (1 << (num_bits + 1));
+			INFO("error: " << error);
+
+			vec0 = vector_set(-0.5F, 0.5F, -0.5F, 0.0F);
+			vec2 = vector_set(-0.5F + error, 0.5F - error, -0.5F + error, 0.0F);
+			pack_vector3_snXX_unsafe(vec0, num_bits, &buffer[0]);
+			vec1 = unpack_vector3_snXX_unsafe(num_bits, &buffer[0], 0);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			for (uint32_t value = 0; value <= max_value; value += 3)
+			{
+				vec0 = vector_set(
+					unpack_scalar_signed_normalized(value, num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 1, max_value), num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 2, max_value), num_bits),
+					0.0F);
+				pack_vector3_snXX_unsafe(vec0, num_bits, &buffer[0]);
+				vec1 = unpack_vector3_snXX_unsafe(num_bits, &buffer[0], 0);
+				pack_vector3_snXX_unsafe(vector_set(
+					std::nextafter(vector_get_x(vec0) + error, -1.0F),
+					std::nextafter(vector_get_y(vec0) + error, -1.0F),
+					std::nextafter(vector_get_z(vec0) + error, -1.0F),
+					0.0F), num_bits, &buffer[0]);
+				vec2 = unpack_vector3_snXX_unsafe(num_bits, &buffer[0], 0);
+				if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				pack_vector3_snXX_unsafe(vector_sub(vec0, vector_set(error)), num_bits, &buffer[0]);
+				vec2 = unpack_vector3_snXX_unsafe(num_bits, &buffer[0], 0);
+				if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				if (!vector_all_near_equal3(vec0, vec1, 0.0F))
+
+#else
+
 			for (uint32_t value = 0; value <= max_value; ++value)
 			{
 				const float value_signed = scalar_clamp(unpack_scalar_signed(value, num_bits), -1.0F, 1.0F);
@@ -395,6 +725,9 @@ TEST_CASE("pack_vector3_XX", "[math][vector4][packing]")
 				pack_vector3_uXX_unsafe(vec0, num_bits, &buffer[0]);
 				vec1 = unpack_vector3_uXX_unsafe(num_bits, &buffer[0], 0);
 				if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
+
+#endif
+
 					num_errors++;

 				{
@@ -404,12 +737,25 @@ TEST_CASE("pack_vector3_XX", "[math][vector4][packing]")
 					const uint8_t offset = offsets[offset_idx];
 					memcpy_bits(&tmp0.buffer[0], offset, &buffer[0], 0, size_t(num_bits) * 3);
+
+#ifdef ACL_PRECISION_BOOST
+
+					vec1 = unpack_vector3_snXX_unsafe(num_bits, &tmp0.buffer[0], offset);
+					if (!vector_all_near_equal3(vec0, vec1, 0.0F))
+
+#else
+
 					vec1 = unpack_vector3_uXX_unsafe(num_bits, &tmp0.buffer[0], offset);
 					if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
+
+#endif
+
 						num_errors++;
 				}
 			}

+#ifndef ACL_PRECISION_BOOST
+
 			vec0 = vector_set(value_signed, value_signed, value_signed);
 			pack_vector3_sXX_unsafe(vec0, num_bits, &buffer[0]);
 			vec1 = unpack_vector3_sXX_unsafe(num_bits, &buffer[0], 0);
@@ -428,6 +774,9 @@ TEST_CASE("pack_vector3_XX", "[math][vector4][packing]")
 				num_errors++;
 			}
 		}
+
+#endif
+
 		}
 	}

 	CHECK(num_errors == 0);
@@ -439,6 +788,14 @@ TEST_CASE("decay_vector3_XX", "[math][vector4][decay]")
 	{
 		uint32_t num_errors = 0;

+#ifdef ACL_PRECISION_BOOST
+
+		vector4f vec0;
+		vector4f vec1;
+		vector4f vec2;
+
+#else
+
 		vector4f vec0 = vector_set(unpack_scalar_signed(0, 16), unpack_scalar_signed(12355, 16), unpack_scalar_signed(43222, 16));
 		vector4f vec1 = decay_vector3_sXX(vec0, 16);
 		if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
@@ -449,10 +806,55 @@ TEST_CASE("decay_vector3_XX", "[math][vector4][decay]")
 		if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
 			num_errors++;

+#endif
+
 		for (uint8_t bit_rate = 1; bit_rate < k_highest_bit_rate; ++bit_rate)
 		{
 			uint32_t num_bits = get_num_bits_at_bit_rate(bit_rate);
 			uint32_t max_value = (1 << num_bits) - 1;
+
+#ifdef ACL_PRECISION_BOOST
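+
+			// decay_* presumably applies the same quantization loss in floats without
+			// packing to memory, so it is probed with the same half-step perturbations.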
+
+			INFO("num_bits: " << int(num_bits));
+			const float error = 1.0F / (1 << (num_bits + 1));
+			INFO("error: " << error);
+
+			vec0 = vector_set(-0.5F, 0.5F, -0.5F, 0.0F);
+			vec2 = vector_set(-0.5F + error, 0.5F - error, -0.5F + error, 0.0F);
+			vec1 = decay_vector3_snXX(vec0, num_bits);
+			if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			for (uint32_t value = 0; value <= max_value; value += 3)
+			{
+				vec0 = vector_set(
+					unpack_scalar_signed_normalized(value, num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 1, max_value), num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 2, max_value), num_bits),
+					0.0F);
+				vec1 = decay_vector3_snXX(vec0, num_bits);
+				vec2 = decay_vector3_snXX(vector_set(
+					std::nextafter(vector_get_x(vec0) + error, -1.0F),
+					std::nextafter(vector_get_y(vec0) + error, -1.0F),
+					std::nextafter(vector_get_z(vec0) + error, -1.0F),
+					0.0F), num_bits);
+				if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				vec2 = decay_vector3_snXX(vector_sub(vec0, vector_set(error)), num_bits);
+				if (!vector_all_near_equal3(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				if (!vector_all_near_equal3(vec0, vec1, 0.0F))
+
+#else
+
 			for (uint32_t value = 0; value <= max_value; ++value)
 			{
 				const float value_signed = scalar_clamp(unpack_scalar_signed(value, num_bits), -1.0F, 1.0F);
@@ -466,6 +868,9 @@ TEST_CASE("decay_vector3_XX", "[math][vector4][decay]")
 			vec0 = vector_set(value_unsigned, value_unsigned, value_unsigned);
 			vec1 = decay_vector3_uXX(vec0, num_bits);
 			if (!vector_all_near_equal3(vec0, vec1, 1.0E-6F))
+
+#endif
+
 				num_errors++;
 		}
 	}
@@ -498,7 +903,17 @@ TEST_CASE("pack_vector2_64", "[math][vector4][packing]")
 			memcpy_bits(&tmp1.buffer[0], offset, &tmp0.buffer[0], 0, 64);

 			vector4f vec1 = unpack_vector2_64_unsafe(&tmp1.buffer[0], offset);
+
+#ifdef ACL_PRECISION_BOOST
+
+			if (!vector_all_near_equal2(vec0, vec1, 0.0F))
+
+#else
+
 			if (!vector_all_near_equal2(vec0, vec1, 1.0E-6F))
+
+#endif
+
 				num_errors++;
 		}

 		CHECK(num_errors == 0);
@@ -512,16 +927,74 @@ TEST_CASE("pack_vector2_XX", "[math][vector4][packing]")
 		alignas(16) uint8_t buffer[64];

 		uint32_t num_errors = 0;
+
+#ifdef ACL_PRECISION_BOOST
+
+		vector4f vec0;
+		vector4f vec1;
+		vector4f vec2;
+
+#else
+
 		vector4f vec0 = vector_set(unpack_scalar_unsigned(0, 16), unpack_scalar_unsigned(12355, 16), unpack_scalar_unsigned(43222, 16), unpack_scalar_unsigned(54432, 16));
 		pack_vector2_uXX_unsafe(vec0, 16, &buffer[0]);
 		vector4f vec1 = unpack_vector2_uXX_unsafe(16, &buffer[0], 0);
 		if (!vector_all_near_equal2(vec0, vec1, 1.0E-6F))
 			num_errors++;

+#endif
+
 		for (uint8_t bit_rate = 1; bit_rate < k_highest_bit_rate; ++bit_rate)
 		{
 			uint32_t num_bits = get_num_bits_at_bit_rate(bit_rate);
 			uint32_t max_value = (1 << num_bits) - 1;
+
+#ifdef ACL_PRECISION_BOOST
+
+			INFO("num_bits: " << int(num_bits));
+			const float error = 1.0F / (1 << (num_bits + 1));
+			INFO("error: " << error);
+
+			vec0 = vector_set(-0.5F, 0.5F, -0.5F, 0.5F);
+			vec2 = vector_set(-0.5F + error, 0.5F - error, -0.5F + error, 0.5F - error);
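+			// Note: only the 4-wide sn packer is exercised here (presumably no 2-wide
+			// variant exists); unpack_vector2_snXX_unsafe reads back the first two lanes.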
+			pack_vector4_snXX_unsafe(vec0, num_bits, &buffer[0]);
+			vec1 = unpack_vector2_snXX_unsafe(num_bits, &buffer[0], 0);
+			if (!vector_all_near_equal2(vec1, vec2, 0.0F))
+			{
+				++num_errors;
+			}
+
+			for (uint32_t value = 0; value <= max_value; value += 2)
+			{
+				vec0 = vector_set(
+					unpack_scalar_signed_normalized(value, num_bits),
+					unpack_scalar_signed_normalized(std::min(value + 1, max_value), num_bits),
+					0.0F,
+					0.0F);
+				pack_vector4_snXX_unsafe(vec0, num_bits, &buffer[0]);
+				vec1 = unpack_vector2_snXX_unsafe(num_bits, &buffer[0], 0);
+				pack_vector4_snXX_unsafe(vector_set(
+					std::nextafter(vector_get_x(vec0) + error, -1.0F),
+					std::nextafter(vector_get_y(vec0) + error, -1.0F),
+					0.0F,
+					0.0F), num_bits, &buffer[0]);
+				vec2 = unpack_vector2_snXX_unsafe(num_bits, &buffer[0], 0);
+				if (!vector_all_near_equal2(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				pack_vector4_snXX_unsafe(vector_sub(vec0, vector_set(error)), num_bits, &buffer[0]);
+				vec2 = unpack_vector2_snXX_unsafe(num_bits, &buffer[0], 0);
+				if (!vector_all_near_equal2(vec1, vec2, 0.0F))
+				{
+					++num_errors;
+				}
+
+				if (!vector_all_near_equal2(vec0, vec1, 0.0F))
+
+#else
+
 			for (uint32_t value = 0; value <= max_value; ++value)
 			{
 				const float value_unsigned = scalar_clamp(unpack_scalar_unsigned(value, num_bits), 0.0F, 1.0F);
@@ -530,6 +1003,9 @@ TEST_CASE("pack_vector2_XX", "[math][vector4][packing]")
 				pack_vector2_uXX_unsafe(vec0, num_bits, &buffer[0]);
 				vec1 = unpack_vector2_uXX_unsafe(num_bits, &buffer[0], 0);
 				if (!vector_all_near_equal2(vec0, vec1, 1.0E-6F))
+
+#endif
+
 					num_errors++;

 				{
@@ -539,8 +1015,19 @@ TEST_CASE("pack_vector2_XX", "[math][vector4][packing]")
 					const uint8_t offset = offsets[offset_idx];
 					memcpy_bits(&tmp0.buffer[0], offset, &buffer[0], 0, size_t(num_bits) * 4);
+
+#ifdef ACL_PRECISION_BOOST
+
+					vec1 = unpack_vector2_snXX_unsafe(num_bits, &tmp0.buffer[0], offset);
+					if (!vector_all_near_equal2(vec0, vec1, 0.0F))
+
+#else
+
 					vec1 = unpack_vector2_uXX_unsafe(num_bits, &tmp0.buffer[0], offset);
 					if (!vector_all_near_equal2(vec0, vec1, 1.0E-6F))
+
+#endif
+
 						num_errors++;
 				}
 			}