Skip to content

Commit

Permalink
StringBuilder append pattern for float/double.
Browse files Browse the repository at this point in the history
Results for added benchmarks on blueline-userdebug with cpu
frequencies fixed at 1420800 (cpus 0-3; little) and 1459200
(cpus 4-7; big):
32-bit little (--variant=X32 --invoke-with 'taskset 0f')
  timeAppendStringAndDouble: ~1260ns -> ~970ns
  timeAppendStringAndFloat: ~1250ns -> ~940ns
  timeAppendStringAndHugeDouble: ~4700ns -> ~4690ns (noise)
  timeAppendStringAndHugeFloat: ~3400ns -> ~3300ns (noise)
  timeAppendStringDoubleStringAndFloat: ~1980ns -> ~1550ns
64-bit little (--variant=X64 --invoke-with 'taskset 0f')
  timeAppendStringAndDouble: ~1260ns -> ~970ns
  timeAppendStringAndFloat: ~1260ns -> ~940ns
  timeAppendStringAndHugeDouble: ~4700ns -> ~4800ns (noise)
  timeAppendStringAndHugeFloat: ~3300ns -> ~3400ns (noise)
  timeAppendStringDoubleStringAndFloat: ~1970ns -> ~1550ns
32-bit big (--variant=X32 --invoke-with 'taskset f0')
  timeAppendStringAndDouble: ~580ns -> ~450ns
  timeAppendStringAndFloat: ~590ns -> ~430ns
  timeAppendStringAndHugeDouble: ~2500ns -> ~2100ns (noise)
  timeAppendStringAndHugeFloat: ~1500ns -> ~1300ns (noise)
  timeAppendStringDoubleStringAndFloat: ~880ns -> ~730ns
64-bit big (--variant=X64 --invoke-with 'taskset f0')
  timeAppendStringAndDouble: ~590ns -> ~450ns
  timeAppendStringAndFloat: ~590ns -> ~430ns
  timeAppendStringAndHugeDouble: ~2300ns -> ~2300ns (noise)
  timeAppendStringAndHugeFloat: ~1500ns -> ~1300ns (noise)
  timeAppendStringDoubleStringAndFloat: ~870ns -> ~730ns

The `timeAppendStringAnd{Double,Float}` benchmarks show very
nice improvements, roughly 25% on both little and big cores.
The `timeAppendStringDoubleStringAndFloat` also shows decent
improvements, over 20% on little and over 15% on big cores.
(These benchmarks test the best-case scenario for "before"
as the StringBuilder's internal buffer is not reallocated.)

The `timeAppendStringAndHuge{Double,Float}` results are too
noisy to draw any conclusions (especially on little cores
but there is still too much noise on big cores as well).

There are also small regressions for existing benchmarks
`timeAppend{LongStrings,StringAndInt,Strings}` but these
non-FP regressions may be mitigated after updating the
ThinLTO profile.

There is also an opportunity to optimize the calls back
to managed code for known shorty (in this change we use
"LD" and "LF") by using a dedicated stub instead of going
through the generic invoke stub.

Boot image size changes are insignificant (few matches).

Test: Added tests to 697-checker-string-append
Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: testrunner.py --target --optimizing
Bug: 19575890
Change-Id: I9cf38c2d615a0a2b14255d18588a694d8870aae5
  • Loading branch information
vmarko committed Jan 3, 2023
1 parent 890b19b commit 41de450
Show file tree
Hide file tree
Showing 9 changed files with 451 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ public class StringBuilderAppendBenchmark {
public static String longString1 = "This is a long string 1";
public static String longString2 = "This is a long string 2";
public static int int1 = 42;
// Floating-point operands for the String + double/float benchmarks in this class.
public static double double1 = 42.0;
// Large-magnitude double, read by timeAppendStringAndHugeDouble.
public static double double2 = 1.0E308;
public static float float1 = 42.0f;
// Large-magnitude float, read by timeAppendStringAndHugeFloat.
public static float float2 = 1.0E38f;

public void timeAppendStrings(int count) {
String s1 = string1;
Expand Down Expand Up @@ -59,4 +63,74 @@ public void timeAppendStringAndInt(int count) {
throw new AssertionError();
}
}

/**
 * Benchmark: concatenates {@code string1} and {@code double1} with the {@code +}
 * operator, {@code count} times.
 *
 * @param count number of concatenations to perform
 * @throws AssertionError if the accumulated result length differs from the expected total
 */
public void timeAppendStringAndDouble(int count) {
// Read the fields into locals once, before the loop.
String s1 = string1;
double d1 = double1;
int sum = 0;
for (int i = 0; i < count; ++i) {
String result = s1 + d1;
sum += result.length(); // Make sure the append is not optimized away.
}
// Every result is expected to be as long as s1 followed by Double.toString(d1).
if (sum != count * (s1.length() + Double.toString(d1).length())) {
throw new AssertionError();
}
}

/**
 * Benchmark: concatenates {@code string1} and the large-magnitude {@code double2}
 * (1.0E308) with the {@code +} operator, {@code count} times.
 *
 * @param count number of concatenations to perform
 * @throws AssertionError if the accumulated result length differs from the expected total
 */
public void timeAppendStringAndHugeDouble(int count) {
// Read the fields into locals once, before the loop.
String s1 = string1;
double d2 = double2;
int sum = 0;
for (int i = 0; i < count; ++i) {
String result = s1 + d2;
sum += result.length(); // Make sure the append is not optimized away.
}
// Every result is expected to be as long as s1 followed by Double.toString(d2).
if (sum != count * (s1.length() + Double.toString(d2).length())) {
throw new AssertionError();
}
}

/**
 * Benchmark: concatenates {@code string1} and {@code float1} with the {@code +}
 * operator, {@code count} times.
 *
 * @param count number of concatenations to perform
 * @throws AssertionError if the accumulated result length differs from the expected total
 */
public void timeAppendStringAndFloat(int count) {
// Read the fields into locals once, before the loop.
String s1 = string1;
float f1 = float1;
int sum = 0;
for (int i = 0; i < count; ++i) {
String result = s1 + f1;
sum += result.length(); // Make sure the append is not optimized away.
}
// Every result is expected to be as long as s1 followed by Float.toString(f1).
if (sum != count * (s1.length() + Float.toString(f1).length())) {
throw new AssertionError();
}
}

/**
 * Benchmark: concatenates {@code string1} and the large-magnitude {@code float2}
 * (1.0E38f) with the {@code +} operator, {@code count} times.
 *
 * @param count number of concatenations to perform
 * @throws AssertionError if the accumulated result length differs from the expected total
 */
public void timeAppendStringAndHugeFloat(int count) {
// Read the fields into locals once, before the loop.
String s1 = string1;
float f2 = float2;
int sum = 0;
for (int i = 0; i < count; ++i) {
String result = s1 + f2;
sum += result.length(); // Make sure the append is not optimized away.
}
// Every result is expected to be as long as s1 followed by Float.toString(f2).
if (sum != count * (s1.length() + Float.toString(f2).length())) {
throw new AssertionError();
}
}

/**
 * Benchmark: builds {@code string1 + double1 + string2 + float1} in a single
 * {@code +} expression, {@code count} times.
 *
 * @param count number of concatenations to perform
 * @throws AssertionError if the accumulated result length differs from the expected total
 */
public void timeAppendStringDoubleStringAndFloat(int count) {
// Read the fields into locals once, before the loop.
String s1 = string1;
String s2 = string2;
double d1 = double1;
float f1 = float1;
int sum = 0;
for (int i = 0; i < count; ++i) {
String result = s1 + d1 + s2 + f1;
sum += result.length(); // Make sure the append is not optimized away.
}
// Expected length per iteration: the four pieces, with the floating-point
// values rendered as by Double.toString / Float.toString.
if (sum != count * (s1.length() +
Double.toString(d1).length() +
s2.length() +
Float.toString(f1).length())) {
throw new AssertionError();
}
}
}
17 changes: 11 additions & 6 deletions compiler/optimizing/instruction_simplifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2652,6 +2652,7 @@ static bool TryReplaceStringBuilderAppend(HInvoke* invoke) {
bool seen_to_string = false;
uint32_t format = 0u;
uint32_t num_args = 0u;
bool has_fp_args = false;
HInstruction* args[StringBuilderAppend::kMaxArgs]; // Added in reverse order.
for (HBackwardInstructionIterator iter(block->GetInstructions()); !iter.Done(); iter.Advance()) {
HInstruction* user = iter.Current();
Expand Down Expand Up @@ -2697,6 +2698,14 @@ static bool TryReplaceStringBuilderAppend(HInvoke* invoke) {
case Intrinsics::kStringBuilderAppendLong:
arg = StringBuilderAppend::Argument::kLong;
break;
case Intrinsics::kStringBuilderAppendFloat:
arg = StringBuilderAppend::Argument::kFloat;
has_fp_args = true;
break;
case Intrinsics::kStringBuilderAppendDouble:
arg = StringBuilderAppend::Argument::kDouble;
has_fp_args = true;
break;
case Intrinsics::kStringBuilderAppendCharSequence: {
ReferenceTypeInfo rti = user->AsInvokeVirtual()->InputAt(1)->GetReferenceTypeInfo();
if (!rti.IsValid()) {
Expand All @@ -2716,10 +2725,6 @@ static bool TryReplaceStringBuilderAppend(HInvoke* invoke) {
}
break;
}
case Intrinsics::kStringBuilderAppendFloat:
case Intrinsics::kStringBuilderAppendDouble:
// TODO: Unimplemented, needs to call FloatingDecimal.getBinaryToASCIIConverter().
return false;
default: {
return false;
}
Expand Down Expand Up @@ -2772,8 +2777,8 @@ static bool TryReplaceStringBuilderAppend(HInvoke* invoke) {
// Create replacement instruction.
HIntConstant* fmt = block->GetGraph()->GetIntConstant(static_cast<int32_t>(format));
ArenaAllocator* allocator = block->GetGraph()->GetAllocator();
HStringBuilderAppend* append =
new (allocator) HStringBuilderAppend(fmt, num_args, allocator, invoke->GetDexPc());
HStringBuilderAppend* append = new (allocator) HStringBuilderAppend(
fmt, num_args, has_fp_args, allocator, invoke->GetDexPc());
append->SetReferenceTypeInfo(invoke->GetReferenceTypeInfo());
for (size_t i = 0; i != num_args; ++i) {
append->SetArgumentAt(i, args[num_args - 1u - i]);
Expand Down
9 changes: 6 additions & 3 deletions compiler/optimizing/nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -7503,14 +7503,17 @@ class HStringBuilderAppend final : public HVariableInputSizeInstruction {
public:
HStringBuilderAppend(HIntConstant* format,
uint32_t number_of_arguments,
bool has_fp_args,
ArenaAllocator* allocator,
uint32_t dex_pc)
: HVariableInputSizeInstruction(
kStringBuilderAppend,
DataType::Type::kReference,
// The runtime call may read memory from inputs. It never writes outside
// of the newly allocated result object (or newly allocated helper objects).
SideEffects::AllReads().Union(SideEffects::CanTriggerGC()),
SideEffects::CanTriggerGC().Union(
// The runtime call may read memory from inputs. It never writes outside
// of the newly allocated result object or newly allocated helper objects,
// except for float/double arguments where we reuse thread-local helper objects.
has_fp_args ? SideEffects::AllWritesAndReads() : SideEffects::AllReads()),
dex_pc,
allocator,
number_of_arguments + /* format */ 1u,
Expand Down
2 changes: 2 additions & 0 deletions runtime/class_linker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1163,6 +1163,8 @@ void ClassLinker::RunRootClinits(Thread* self) {
WellKnownClasses::java_lang_invoke_MethodHandles_lookup,
// Ensure `DirectByteBuffer` class is initialized (avoid check at runtime).
WellKnownClasses::java_nio_DirectByteBuffer_init,
// Ensure `FloatingDecimal` class is initialized (avoid check at runtime).
WellKnownClasses::jdk_internal_math_FloatingDecimal_getBinaryToASCIIConverter_D,
// Ensure reflection annotation classes are initialized (avoid check at runtime).
WellKnownClasses::libcore_reflect_AnnotationFactory_createAnnotation,
WellKnownClasses::libcore_reflect_AnnotationMember_init,
Expand Down
4 changes: 2 additions & 2 deletions runtime/image.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
namespace art {

const uint8_t ImageHeader::kImageMagic[] = { 'a', 'r', 't', '\n' };
// Last change: Math.fma(double, double, double) intrinsic.
const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '6', '\0' };
// Last change: StringBuilderAppend for float/double.
const uint8_t ImageHeader::kImageVersion[] = { '1', '0', '7', '\0' };

ImageHeader::ImageHeader(uint32_t image_reservation_size,
uint32_t component_count,
Expand Down
Loading

0 comments on commit 41de450

Please sign in to comment.