From 3822fbe5d24f3c08a5e23d870f7928067e524b94 Mon Sep 17 00:00:00 2001 From: matthewhegarty Date: Sat, 19 Oct 2024 09:49:16 +0100 Subject: [PATCH 1/5] patch from upstream: 'Python3: Stop breaking surrogate pairs in toDelta()' --- diff_match_patch/diff_match_patch.py | 17 +++-- .../tests/diff_match_patch_test.py | 64 +++++++++++++++++++ 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/diff_match_patch/diff_match_patch.py b/diff_match_patch/diff_match_patch.py index 683f948..a9b909f 100644 --- a/diff_match_patch/diff_match_patch.py +++ b/diff_match_patch/diff_match_patch.py @@ -1136,6 +1136,8 @@ def diff_prettyHtml(self, diffs): """ html = [] for op, data in diffs: + if 0 == len(data): + continue text = ( data.replace("&", "&") .replace("<", "<") @@ -1225,9 +1227,9 @@ def diff_toDelta(self, diffs): data = data.encode("utf-8") text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: - text.append("-%d" % len(data)) + text.append("-%d" % (len(data.encode('utf-16-be')) // 2)) elif op == self.DIFF_EQUAL: - text.append("=%d" % len(data)) + text.append("=%d" % (len(data.encode('utf-16-be')) // 2)) return "\t".join(text) def diff_fromDelta(self, text1, delta): @@ -1245,7 +1247,8 @@ def diff_fromDelta(self, text1, delta): ValueError: If invalid input. 
""" diffs = [] - pointer = 0 # Cursor in text1 + as_utf16 = text1.encode('utf-16-be') + pointer = 0 # Cursor in as_utf16 tokens = delta.split("\t") for token in tokens: if token == "": @@ -1264,8 +1267,8 @@ def diff_fromDelta(self, text1, delta): raise ValueError("Invalid number in diff_fromDelta: " + param) if n < 0: raise ValueError("Negative number in diff_fromDelta: " + param) - text = text1[pointer : pointer + n] - pointer += n + text = as_utf16[pointer: pointer + n * 2].decode('utf-16-be') + pointer += n * 2 if token[0] == "=": diffs.append((self.DIFF_EQUAL, text)) else: @@ -1275,10 +1278,10 @@ def diff_fromDelta(self, text1, delta): raise ValueError( "Invalid diff operation in diff_fromDelta: " + token[0] ) - if pointer != len(text1): + if pointer != len(as_utf16): raise ValueError( "Delta length (%d) does not equal source text length (%d)." - % (pointer, len(text1)) + % (pointer, len(as_utf16)) ) return diffs diff --git a/diff_match_patch/tests/diff_match_patch_test.py b/diff_match_patch/tests/diff_match_patch_test.py index 64a6666..8457716 100644 --- a/diff_match_patch/tests/diff_match_patch_test.py +++ b/diff_match_patch/tests/diff_match_patch_test.py @@ -833,6 +833,11 @@ def testDiffDelta(self): # Convert delta string into a diff. self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta)) + diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B") + delta = self.dmp.diff_toDelta(diffs) + self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta) + self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2")) + # Verify pool of unchanged characters. diffs = [ ( @@ -849,6 +854,65 @@ def testDiffDelta(self): # Convert delta string into a diff. 
self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta)) + # Unicode: split surrogates + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_INSERT, '\U0001F171'), + (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170\U0001F171', + '\U0001F171\U0001F170\U0001F171' + )), + 'Inserting similar surrogate pair at beginning' + ) + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, '\U0001F170'), + (self.dmp.DIFF_INSERT, '\U0001F172'), + (self.dmp.DIFF_EQUAL, '\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170\U0001F171', + '\U0001F170\U0001F172\U0001F171' + )), + 'Inserting similar surrogate pair in the middle' + ) + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_DELETE, '\U0001F171'), + (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F171\U0001F170\U0001F171', + '\U0001F170\U0001F171' + )), + 'Deleting similar surogate pair at the beginning' + ) + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_EQUAL, '\U0001F170'), + (self.dmp.DIFF_DELETE, '\U0001F172'), + (self.dmp.DIFF_EQUAL, '\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170\U0001F172\U0001F171', + '\U0001F170\U0001F171' + )), + 'Deleting similar surogate pair in the middle' + ) + self.assertEqual( + self.dmp.diff_toDelta([ + (self.dmp.DIFF_DELETE, '\U0001F170'), + (self.dmp.DIFF_INSERT, '\U0001F171') + ]), + self.dmp.diff_toDelta(self.dmp.diff_main( + '\U0001F170', + '\U0001F171' + )), + 'Swap surrogate pair' + ) + # 160 kb string. 
a = "abcdefghij" for i in range(14): From 5081a1e8cc135e6f8832446f2cd9cec5f4faded0 Mon Sep 17 00:00:00 2001 From: matthewhegarty Date: Sat, 19 Oct 2024 12:59:42 +0100 Subject: [PATCH 2/5] updated README --- CONTRIBUTING.md | 15 +++++++++++---- README.md | 3 +++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a273c52..1044260 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,9 +16,16 @@ This is a partial fork of [diff-match-patch][], with extra bits to make this a modern, friendly member of the Python packaging ecosystem. The library will be periodically updated with changes -from the upstream project. If you would like to -contribute fixes or improvements to the library -itself, and not the packaging code, please submit -them to the upstream library directly. +from the upstream project. + +Since August 2024, Google's diff_match_patch +library is archived, but there is a +[maintained][diff-match-patch-maintained] fork. + +If you would like to contribute fixes or +improvements to the library itself, and not the +packaging code, please submit them to the upstream +maintained fork library directly. [diff-match-patch]: https://github.com/google/diff-match-patch +[diff-match-patch-maintained]: https://github.com/dmsnell/diff-match-patch \ No newline at end of file diff --git a/README.md b/README.md index bdcd2a4..32acb3c 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ Google's [Diff Match and Patch][DMP] library, packaged for modern Python. +Since August 2024, this project will now track the [maintained fork][diff-match-patch-maintained]. 
+ [![version](https://img.shields.io/pypi/v/diff-match-patch.svg)](https://pypi.org/project/diff-match-patch) [![changelog](https://img.shields.io/badge/change-log-blue)](https://github.com/diff-match-patch-python/diff-match-patch/blob/main/CHANGELOG.md) [![license](https://img.shields.io/pypi/l/diff-match-patch.svg)](https://github.com/diff-match-patch-python/diff-match-patch/blob/master/LICENSE) @@ -82,3 +84,4 @@ This library also implements a [Bitap matching algorithm](https://neil.fraser.na [DMP]: https://github.com/google/diff-match-patch [API]: https://github.com/google/diff-match-patch/wiki/API +[diff-match-patch-maintained]: https://github.com/dmsnell/diff-match-patch \ No newline at end of file From 8903567edf63fd10046c78e25315ffc4bfe53974 Mon Sep 17 00:00:00 2001 From: matthewhegarty Date: Sat, 19 Oct 2024 13:02:49 +0100 Subject: [PATCH 3/5] minor reformatting --- CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1044260..60f34ac 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,9 +18,9 @@ member of the Python packaging ecosystem. The library will be periodically updated with changes from the upstream project. -Since August 2024, Google's diff_match_patch +Since August 2024, Google's diff-match-patch library is archived, but there is a -[maintained][diff-match-patch-maintained] fork. +[maintained fork][diff-match-patch-maintained]. If you would like to contribute fixes or improvements to the library itself, and not the From e9412a5b1d837eb14a2467f8291102c8fb4644fc Mon Sep 17 00:00:00 2001 From: matthewhegarty Date: Sat, 19 Oct 2024 13:04:17 +0100 Subject: [PATCH 4/5] clarified language --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 32acb3c..aa44a84 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ Google's [Diff Match and Patch][DMP] library, packaged for modern Python. 
-Since August 2024, this project will now track the [maintained fork][diff-match-patch-maintained]. +Since August 2024, Google's diff-match-patch library is archived, and +this project will now track the [maintained fork][diff-match-patch-maintained]. [![version](https://img.shields.io/pypi/v/diff-match-patch.svg)](https://pypi.org/project/diff-match-patch) [![changelog](https://img.shields.io/badge/change-log-blue)](https://github.com/diff-match-patch-python/diff-match-patch/blob/main/CHANGELOG.md) From 325766c2eefd42378a26959fdd328bac6223fa75 Mon Sep 17 00:00:00 2001 From: matthewhegarty Date: Sat, 19 Oct 2024 13:06:05 +0100 Subject: [PATCH 5/5] applied formatting --- diff_match_patch/diff_match_patch.py | 8 +- .../tests/diff_match_patch_test.py | 130 ++++++++++-------- 2 files changed, 77 insertions(+), 61 deletions(-) diff --git a/diff_match_patch/diff_match_patch.py b/diff_match_patch/diff_match_patch.py index a9b909f..6f5be02 100644 --- a/diff_match_patch/diff_match_patch.py +++ b/diff_match_patch/diff_match_patch.py @@ -1227,9 +1227,9 @@ def diff_toDelta(self, diffs): data = data.encode("utf-8") text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: - text.append("-%d" % (len(data.encode('utf-16-be')) // 2)) + text.append("-%d" % (len(data.encode("utf-16-be")) // 2)) elif op == self.DIFF_EQUAL: - text.append("=%d" % (len(data.encode('utf-16-be')) // 2)) + text.append("=%d" % (len(data.encode("utf-16-be")) // 2)) return "\t".join(text) def diff_fromDelta(self, text1, delta): @@ -1247,7 +1247,7 @@ def diff_fromDelta(self, text1, delta): ValueError: If invalid input. 
""" diffs = [] - as_utf16 = text1.encode('utf-16-be') + as_utf16 = text1.encode("utf-16-be") pointer = 0 # Cursor in as_utf16 tokens = delta.split("\t") for token in tokens: @@ -1267,7 +1267,7 @@ def diff_fromDelta(self, text1, delta): raise ValueError("Invalid number in diff_fromDelta: " + param) if n < 0: raise ValueError("Negative number in diff_fromDelta: " + param) - text = as_utf16[pointer: pointer + n * 2].decode('utf-16-be') + text = as_utf16[pointer : pointer + n * 2].decode("utf-16-be") pointer += n * 2 if token[0] == "=": diffs.append((self.DIFF_EQUAL, text)) diff --git a/diff_match_patch/tests/diff_match_patch_test.py b/diff_match_patch/tests/diff_match_patch_test.py index 8457716..1912c9a 100644 --- a/diff_match_patch/tests/diff_match_patch_test.py +++ b/diff_match_patch/tests/diff_match_patch_test.py @@ -833,10 +833,15 @@ def testDiffDelta(self): # Convert delta string into a diff. self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta)) - diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B") + diffs = self.dmp.diff_main( + "\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B" + ) delta = self.dmp.diff_toDelta(diffs) self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta) - self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2")) + self.assertEqual( + diffs, + self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"), + ) # Verify pool of unchanged characters. 
diffs = [ @@ -856,61 +861,72 @@ def testDiffDelta(self): # Unicode: split surrogates self.assertEqual( - self.dmp.diff_toDelta([ - (self.dmp.DIFF_INSERT, '\U0001F171'), - (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171') - ]), - self.dmp.diff_toDelta(self.dmp.diff_main( - '\U0001F170\U0001F171', - '\U0001F171\U0001F170\U0001F171' - )), - 'Inserting similar surrogate pair at beginning' - ) - self.assertEqual( - self.dmp.diff_toDelta([ - (self.dmp.DIFF_EQUAL, '\U0001F170'), - (self.dmp.DIFF_INSERT, '\U0001F172'), - (self.dmp.DIFF_EQUAL, '\U0001F171') - ]), - self.dmp.diff_toDelta(self.dmp.diff_main( - '\U0001F170\U0001F171', - '\U0001F170\U0001F172\U0001F171' - )), - 'Inserting similar surrogate pair in the middle' - ) - self.assertEqual( - self.dmp.diff_toDelta([ - (self.dmp.DIFF_DELETE, '\U0001F171'), - (self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171') - ]), - self.dmp.diff_toDelta(self.dmp.diff_main( - '\U0001F171\U0001F170\U0001F171', - '\U0001F170\U0001F171' - )), - 'Deleting similar surogate pair at the beginning' - ) - self.assertEqual( - self.dmp.diff_toDelta([ - (self.dmp.DIFF_EQUAL, '\U0001F170'), - (self.dmp.DIFF_DELETE, '\U0001F172'), - (self.dmp.DIFF_EQUAL, '\U0001F171') - ]), - self.dmp.diff_toDelta(self.dmp.diff_main( - '\U0001F170\U0001F172\U0001F171', - '\U0001F170\U0001F171' - )), - 'Deleting similar surogate pair in the middle' - ) - self.assertEqual( - self.dmp.diff_toDelta([ - (self.dmp.DIFF_DELETE, '\U0001F170'), - (self.dmp.DIFF_INSERT, '\U0001F171') - ]), - self.dmp.diff_toDelta(self.dmp.diff_main( - '\U0001F170', - '\U0001F171' - )), - 'Swap surrogate pair' + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_INSERT, "\U0001F171"), + (self.dmp.DIFF_EQUAL, "\U0001F170\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F170\U0001F171", "\U0001F171\U0001F170\U0001F171" + ) + ), + "Inserting similar surrogate pair at beginning", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_EQUAL, "\U0001F170"), + 
(self.dmp.DIFF_INSERT, "\U0001F172"), + (self.dmp.DIFF_EQUAL, "\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F170\U0001F171", "\U0001F170\U0001F172\U0001F171" + ) + ), + "Inserting similar surrogate pair in the middle", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_DELETE, "\U0001F171"), + (self.dmp.DIFF_EQUAL, "\U0001F170\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F171\U0001F170\U0001F171", "\U0001F170\U0001F171" + ) + ), + "Deleting similar surogate pair at the beginning", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_EQUAL, "\U0001F170"), + (self.dmp.DIFF_DELETE, "\U0001F172"), + (self.dmp.DIFF_EQUAL, "\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F170\U0001F172\U0001F171", "\U0001F170\U0001F171" + ) + ), + "Deleting similar surogate pair in the middle", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_DELETE, "\U0001F170"), + (self.dmp.DIFF_INSERT, "\U0001F171"), + ] + ), + self.dmp.diff_toDelta(self.dmp.diff_main("\U0001F170", "\U0001F171")), + "Swap surrogate pair", ) # 160 kb string.