Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FIX: Use lazy encoding in UTF-8 encoded string comparison #2021

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ If you are using Maven without the BOM, add this to your dependencies:
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-firestore</artifactId>
<version>3.30.7</version>
<version>3.30.8</version>
</dependency>

```
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,31 @@ public int compare(@Nonnull Value left, @Nonnull Value right) {

/** Compare strings in UTF-8 encoded byte order */
public static int compareUtf8Strings(String left, String right) {
ByteString leftBytes = ByteString.copyFromUtf8(left);
ByteString rightBytes = ByteString.copyFromUtf8(right);
return compareByteStrings(leftBytes, rightBytes);
int i = 0;
while (i < left.length() && i < right.length()) {
int leftCodePoint = left.codePointAt(i);
int rightCodePoint = right.codePointAt(i);

if (leftCodePoint != rightCodePoint) {
if (leftCodePoint < 128 && rightCodePoint < 128) {
// ASCII comparison
return Integer.compare(leftCodePoint, rightCodePoint);
} else {
// UTF-8 encoded byte comparison, substring 2 indexes to cover surrogate pairs
ByteString leftBytes =
ByteString.copyFromUtf8(left.substring(i, Math.min(i + 2, left.length())));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be left.length() - i?

Copy link
Contributor Author

@milaGGL milaGGL Feb 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why? do u mean encode all the remaining strings? 🤔

But we have confirmed that we encountered the characters that differ, which could be up to 2 index, considering surrogate pairs. If we are already promised that we can find the difference within 2 indexes, is there still a need to encode all remaining? for example, "ab👍cdefghjklm..." vs "ab👎ghjkiuytrd....".

I think substrings 2 indexes is already being very generous

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh you're right. I incorrectly thought left.length() was the number of chars in the substring. My bad. Nevermind this comment 🤦

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: @milaGGL I would improve the following comment to be more descriptive (along the lines of what you said above):

changing

// UTF-8 encoded byte comparison, substring 2 indexes to cover surrogate pairs

to

// We have identified a code point difference, so we don't need to
// encode/compare _all_ the remainder of the strings. We only need to 
// compare up to 2 more indexes to cover potential surrogate pairs.

ByteString rightBytes =
ByteString.copyFromUtf8(right.substring(i, Math.min(i + 2, right.length())));
return compareByteStrings(leftBytes, rightBytes);
}
}

// Increment by 2 for surrogate pairs, 1 otherwise
i += Character.charCount(leftCodePoint);
}

// Compare lengths if all characters are equal
return Integer.compare(left.length(), right.length());
}

private int compareBlobs(Value left, Value right) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,10 @@ public void snapshotListenerSortsUnicodeStringsSameWayAsServer() throws Exceptio
.set(col.document("e"), map("value", "P"))
.set(col.document("f"), map("value", "︒"))
.set(col.document("g"), map("value", "🐵"))
.set(col.document("h"), map("value", "你好"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This integration test coverage is good proof-of-concept. Please add solid unit test coverage too, as the logic is very delicate and exploits low-level properties of utf8 and utf16 encoding that are not well-known, obvious, or straight-forward.

.set(col.document("i"), map("value", "你顥"))
.set(col.document("j"), map("value", "😁"))
.set(col.document("k"), map("value", "😀"))
.commit()
.get();

Expand All @@ -1192,7 +1196,7 @@ public void snapshotListenerSortsUnicodeStringsSameWayAsServer() throws Exceptio
latch.await();
registration.remove();

assertEquals(queryOrder, Arrays.asList("b", "a", "c", "f", "e", "d", "g"));
assertEquals(queryOrder, Arrays.asList("b", "a", "h", "i", "c", "f", "e", "d", "g", "k", "j"));
assertEquals(queryOrder, listenerOrder);
}

Expand All @@ -1209,6 +1213,10 @@ public void snapshotListenerSortsUnicodeStringsInArraySameWayAsServer() throws E
.set(col.document("e"), map("value", Arrays.asList("P")))
.set(col.document("f"), map("value", Arrays.asList("︒")))
.set(col.document("g"), map("value", Arrays.asList("🐵")))
.set(col.document("h"), map("value", Arrays.asList("你好")))
.set(col.document("i"), map("value", Arrays.asList("你顥")))
.set(col.document("j"), map("value", Arrays.asList("😁")))
.set(col.document("k"), map("value", Arrays.asList("😀")))
.commit()
.get();

Expand All @@ -1232,7 +1240,7 @@ public void snapshotListenerSortsUnicodeStringsInArraySameWayAsServer() throws E
latch.await();
registration.remove();

assertEquals(queryOrder, Arrays.asList("b", "a", "c", "f", "e", "d", "g"));
assertEquals(queryOrder, Arrays.asList("b", "a", "h", "i", "c", "f", "e", "d", "g", "k", "j"));
assertEquals(queryOrder, listenerOrder);
}

Expand All @@ -1249,6 +1257,10 @@ public void snapshotListenerSortsUnicodeStringsInMapSameWayAsServer() throws Exc
.set(col.document("e"), map("value", map("foo", "P")))
.set(col.document("f"), map("value", map("foo", "︒")))
.set(col.document("g"), map("value", map("foo", "🐵")))
.set(col.document("h"), map("value", map("foo", "你好")))
.set(col.document("i"), map("value", map("foo", "你顥")))
.set(col.document("j"), map("value", map("foo", "😁")))
.set(col.document("k"), map("value", map("foo", "😀")))
.commit()
.get();

Expand All @@ -1272,7 +1284,7 @@ public void snapshotListenerSortsUnicodeStringsInMapSameWayAsServer() throws Exc
latch.await();
registration.remove();

assertEquals(queryOrder, Arrays.asList("b", "a", "c", "f", "e", "d", "g"));
assertEquals(queryOrder, Arrays.asList("b", "a", "h", "i", "c", "f", "e", "d", "g", "k", "j"));
assertEquals(queryOrder, listenerOrder);
}

Expand All @@ -1289,6 +1301,10 @@ public void snapshotListenerSortsUnicodeStringsInMapKeySameWayAsServer() throws
.set(col.document("e"), map("value", map("P", "foo")))
.set(col.document("f"), map("value", map("︒", "foo")))
.set(col.document("g"), map("value", map("🐵", "foo")))
.set(col.document("h"), map("value", map("你好", "foo")))
.set(col.document("i"), map("value", map("你顥", "foo")))
.set(col.document("j"), map("value", map("😁", "foo")))
.set(col.document("k"), map("value", map("😀", "foo")))
.commit()
.get();

Expand All @@ -1312,7 +1328,7 @@ public void snapshotListenerSortsUnicodeStringsInMapKeySameWayAsServer() throws
latch.await();
registration.remove();

assertEquals(queryOrder, Arrays.asList("b", "a", "c", "f", "e", "d", "g"));
assertEquals(queryOrder, Arrays.asList("b", "a", "h", "i", "c", "f", "e", "d", "g", "k", "j"));
assertEquals(queryOrder, listenerOrder);
}

Expand All @@ -1329,6 +1345,10 @@ public void snapshotListenerSortsUnicodeStringsInDocumentKeySameWayAsServer() th
.set(col.document("P"), map("value", "foo"))
.set(col.document("︒"), map("value", "foo"))
.set(col.document("🐵"), map("value", "foo"))
.set(col.document("你好"), map("value", "你好"))
.set(col.document("你顥"), map("value", "你顥"))
.set(col.document("😁"), map("value", "😁"))
.set(col.document("😀"), map("value", "😀"))
.commit()
.get();

Expand All @@ -1353,7 +1373,9 @@ public void snapshotListenerSortsUnicodeStringsInDocumentKeySameWayAsServer() th
registration.remove();

assertEquals(
queryOrder, Arrays.asList("Sierpiński", "Łukasiewicz", "岩澤", "︒", "P", "🄟", "🐵"));
queryOrder,
Arrays.asList(
"Sierpiński", "Łukasiewicz", "你好", "你顥", "岩澤", "︒", "P", "🄟", "🐵", "😀", "😁"));
assertEquals(queryOrder, listenerOrder);
}
}