diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a273c52..60f34ac 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,9 +16,16 @@ This is a partial fork of [diff-match-patch][], with extra bits to make this a modern, friendly member of the Python packaging ecosystem. The library will be periodically updated with changes -from the upstream project. If you would like to -contribute fixes or improvements to the library -itself, and not the packaging code, please submit -them to the upstream library directly. +from the upstream project. + +Since August 2024, Google's diff-match-patch +library has been archived, but there is a +[maintained fork][diff-match-patch-maintained]. + +If you would like to contribute fixes or +improvements to the library itself, and not the +packaging code, please submit them directly to the +maintained upstream fork. [diff-match-patch]: https://github.com/google/diff-match-patch +[diff-match-patch-maintained]: https://github.com/dmsnell/diff-match-patch \ No newline at end of file diff --git a/README.md b/README.md index bdcd2a4..aa44a84 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ Google's [Diff Match and Patch][DMP] library, packaged for modern Python. +Since August 2024, Google's diff-match-patch library has been archived, and +this project will now track the [maintained fork][diff-match-patch-maintained]. 
+ [![version](https://img.shields.io/pypi/v/diff-match-patch.svg)](https://pypi.org/project/diff-match-patch) [![changelog](https://img.shields.io/badge/change-log-blue)](https://github.com/diff-match-patch-python/diff-match-patch/blob/main/CHANGELOG.md) [![license](https://img.shields.io/pypi/l/diff-match-patch.svg)](https://github.com/diff-match-patch-python/diff-match-patch/blob/master/LICENSE) @@ -82,3 +85,4 @@ This library also implements a [Bitap matching algorithm](https://neil.fraser.na [DMP]: https://github.com/google/diff-match-patch [API]: https://github.com/google/diff-match-patch/wiki/API +[diff-match-patch-maintained]: https://github.com/dmsnell/diff-match-patch \ No newline at end of file diff --git a/diff_match_patch/diff_match_patch.py b/diff_match_patch/diff_match_patch.py index 683f948..6f5be02 100644 --- a/diff_match_patch/diff_match_patch.py +++ b/diff_match_patch/diff_match_patch.py @@ -1136,6 +1136,8 @@ def diff_prettyHtml(self, diffs): """ html = [] for op, data in diffs: + if 0 == len(data): + continue text = ( data.replace("&", "&") .replace("<", "<") @@ -1225,9 +1227,9 @@ def diff_toDelta(self, diffs): data = data.encode("utf-8") text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# ")) elif op == self.DIFF_DELETE: - text.append("-%d" % len(data)) + text.append("-%d" % (len(data.encode("utf-16-be")) // 2)) elif op == self.DIFF_EQUAL: - text.append("=%d" % len(data)) + text.append("=%d" % (len(data.encode("utf-16-be")) // 2)) return "\t".join(text) def diff_fromDelta(self, text1, delta): @@ -1245,7 +1247,8 @@ def diff_fromDelta(self, text1, delta): ValueError: If invalid input. 
""" diffs = [] - pointer = 0 # Cursor in text1 + as_utf16 = text1.encode("utf-16-be") + pointer = 0 # Cursor in as_utf16 tokens = delta.split("\t") for token in tokens: if token == "": @@ -1264,8 +1267,8 @@ def diff_fromDelta(self, text1, delta): raise ValueError("Invalid number in diff_fromDelta: " + param) if n < 0: raise ValueError("Negative number in diff_fromDelta: " + param) - text = text1[pointer : pointer + n] - pointer += n + text = as_utf16[pointer : pointer + n * 2].decode("utf-16-be") + pointer += n * 2 if token[0] == "=": diffs.append((self.DIFF_EQUAL, text)) else: @@ -1275,10 +1278,10 @@ def diff_fromDelta(self, text1, delta): raise ValueError( "Invalid diff operation in diff_fromDelta: " + token[0] ) - if pointer != len(text1): + if pointer != len(as_utf16): raise ValueError( "Delta length (%d) does not equal source text length (%d)." - % (pointer, len(text1)) + % (pointer, len(as_utf16)) ) return diffs diff --git a/diff_match_patch/tests/diff_match_patch_test.py b/diff_match_patch/tests/diff_match_patch_test.py index 64a6666..1912c9a 100644 --- a/diff_match_patch/tests/diff_match_patch_test.py +++ b/diff_match_patch/tests/diff_match_patch_test.py @@ -833,6 +833,16 @@ def testDiffDelta(self): # Convert delta string into a diff. self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta)) + diffs = self.dmp.diff_main( + "\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B" + ) + delta = self.dmp.diff_toDelta(diffs) + self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta) + self.assertEqual( + diffs, + self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"), + ) + # Verify pool of unchanged characters. diffs = [ ( @@ -849,6 +859,76 @@ def testDiffDelta(self): # Convert delta string into a diff. 
self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta)) + # Unicode: split surrogates + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_INSERT, "\U0001F171"), + (self.dmp.DIFF_EQUAL, "\U0001F170\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F170\U0001F171", "\U0001F171\U0001F170\U0001F171" + ) + ), + "Inserting similar surrogate pair at beginning", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_EQUAL, "\U0001F170"), + (self.dmp.DIFF_INSERT, "\U0001F172"), + (self.dmp.DIFF_EQUAL, "\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F170\U0001F171", "\U0001F170\U0001F172\U0001F171" + ) + ), + "Inserting similar surrogate pair in the middle", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_DELETE, "\U0001F171"), + (self.dmp.DIFF_EQUAL, "\U0001F170\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F171\U0001F170\U0001F171", "\U0001F170\U0001F171" + ) + ), + "Deleting similar surogate pair at the beginning", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_EQUAL, "\U0001F170"), + (self.dmp.DIFF_DELETE, "\U0001F172"), + (self.dmp.DIFF_EQUAL, "\U0001F171"), + ] + ), + self.dmp.diff_toDelta( + self.dmp.diff_main( + "\U0001F170\U0001F172\U0001F171", "\U0001F170\U0001F171" + ) + ), + "Deleting similar surogate pair in the middle", + ) + self.assertEqual( + self.dmp.diff_toDelta( + [ + (self.dmp.DIFF_DELETE, "\U0001F170"), + (self.dmp.DIFF_INSERT, "\U0001F171"), + ] + ), + self.dmp.diff_toDelta(self.dmp.diff_main("\U0001F170", "\U0001F171")), + "Swap surrogate pair", + ) + # 160 kb string. a = "abcdefghij" for i in range(14):