From 2ad968b3165406bbeb2de375bc4365c26c04159d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Fri, 3 Jan 2025 18:02:25 +0100 Subject: [PATCH] Rudimentary commit diffing and text diffing --- .../playground/data-liberation/bootstrap.php | 1 + .../src/git/WP_Git_Diff_Engine.php | 158 ++++++++++++++++++ .../src/git/WP_Git_Repository.php | 122 +++++++++++++- 3 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 packages/playground/data-liberation/src/git/WP_Git_Diff_Engine.php diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index a9c6cbacd4..5d5419f9b8 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -92,6 +92,7 @@ require_once __DIR__ . '/src/git/WP_Git_Pack_Processor.php'; require_once __DIR__ . '/src/git/WP_Git_Repository.php'; require_once __DIR__ . '/src/git/WP_Git_Filesystem.php'; +require_once __DIR__ . '/src/git/WP_Git_Diff_Engine.php'; require_once __DIR__ . '/src/WP_Data_Liberation_HTML_Processor.php'; require_once __DIR__ . 
'/src/utf8_decoder.php';
diff --git a/packages/playground/data-liberation/src/git/WP_Git_Diff_Engine.php b/packages/playground/data-liberation/src/git/WP_Git_Diff_Engine.php
new file mode 100644
index 0000000000..07be4e3d4a
--- /dev/null
+++ b/packages/playground/data-liberation/src/git/WP_Git_Diff_Engine.php
@@ -0,0 +1,194 @@
+calculateLCS($oldLines, $newLines);
+
+		$oldIndex = 0;
+		$newIndex = 0;
+		$changes = [];
+
+		foreach ($lcs as $match) {
+			while ($oldIndex < $match['oldIndex'] || $newIndex < $match['newIndex']) {
+				if ($oldIndex < $match['oldIndex']) {
+					$changes[] = ['type' => '-', 'line' => $oldLines[$oldIndex], 'oldIndex' => $oldIndex, 'newIndex' => null];
+					$oldIndex++;
+				}
+				if ($newIndex < $match['newIndex']) {
+					$changes[] = ['type' => '+', 'line' => $newLines[$newIndex], 'oldIndex' => null, 'newIndex' => $newIndex];
+					$newIndex++;
+				}
+			}
+
+			// Add matching line as context
+			if ($oldIndex < count($oldLines) && $newIndex < count($newLines)) {
+				$changes[] = ['type' => ' ', 'line' => $oldLines[$oldIndex], 'oldIndex' => $oldIndex, 'newIndex' => $newIndex];
+				$oldIndex++;
+				$newIndex++;
+			}
+		}
+
+		// Add remaining lines
+		while ($oldIndex < count($oldLines)) {
+			$changes[] = ['type' => '-', 'line' => $oldLines[$oldIndex], 'oldIndex' => $oldIndex, 'newIndex' => null];
+			$oldIndex++;
+		}
+		while ($newIndex < count($newLines)) {
+			$changes[] = ['type' => '+', 'line' => $newLines[$newIndex], 'oldIndex' => null, 'newIndex' => $newIndex];
+			$newIndex++;
+		}
+
+		return $changes;
+	}
+
+	/**
+	 * Renders a list of changes as a Git-style unified diff.
+	 *
+	 * @param array $changes List of change records. Each record is an array
+	 *                       with the keys 'type' ('+', '-' or ' '), 'line',
+	 *                       'oldIndex' and 'newIndex' (zero-based or null).
+	 * @param array $options Optional settings: 'contextLines' (default 3) is
+	 *                       the number of unchanged lines kept around a change,
+	 *                       'a_source' (default 'a/string') and 'b_source'
+	 *                       (default 'b/string') label the two sides.
+	 * @return string The formatted diff.
+	 */
+	public function formatAsGit($changes, $options = []) {
+		$options['contextLines'] ??= 3;
+		$options['a_source'] ??= 'a/string';
+		$options['b_source'] ??= 'b/string';
+
+		$contextLines = max(0, (int) $options['contextLines']);
+		// The grouping below relies on consecutive, zero-based positions.
+		$changes = array_values($changes);
+
+		// Format the diff to Git-style with context
+		$formattedDiff = "diff --git " . $options['a_source'] . " " . $options['b_source'] . "\n";
+		$formattedDiff .= "--- " . $options['a_source'] . "\n";
+		$formattedDiff .= "+++ " . $options['b_source'] . "\n";
+
+		$changeBlocks = [];
+		$currentBlock = [];
+		$last_changed_lineno = null;
+		// Position right after the last line of the most recently closed block.
+		$previous_block_end = 0;
+
+		foreach ($changes as $lineno => $change) {
+			if ($change['type'] === ' ') {
+				if (empty($currentBlock)) {
+					continue;
+				}
+				if ($lineno - $last_changed_lineno > $contextLines) {
+					$changeBlocks[] = $currentBlock;
+					$currentBlock = [];
+					$previous_block_end = $lineno;
+					continue;
+				}
+			} elseif (empty($currentBlock)) {
+				/*
+				 * Open a new block with the context lines directly preceding
+				 * this change. Never reach back into the previous block so
+				 * that no line is emitted twice.
+				 */
+				$offset = max($previous_block_end, $lineno - $contextLines);
+				$currentBlock = array_slice($changes, $offset, $lineno - $offset);
+			}
+
+			$currentBlock[] = $change;
+
+			if ($change['type'] !== ' ') {
+				$last_changed_lineno = $lineno;
+			}
+		}
+
+		if (!empty($currentBlock)) {
+			$changeBlocks[] = $currentBlock;
+		}
+
+		foreach ($changeBlocks as $blockChanges) {
+			$oldStart = null;
+			$newStart = null;
+			$oldCount = 0;
+			$newCount = 0;
+
+			foreach ($blockChanges as $change) {
+				if ($change['type'] !== '+') {
+					$oldStart ??= $change['oldIndex'];
+					$oldCount++;
+				}
+				if ($change['type'] !== '-') {
+					$newStart ??= $change['newIndex'];
+					$newCount++;
+				}
+			}
+
+			// Hunk headers are one-based; 0 is used when a side has no lines.
+			$oldStart = $oldStart !== null ? $oldStart + 1 : 0;
+			$newStart = $newStart !== null ? $newStart + 1 : 0;
+
+			// The header must end its own line, hence the trailing "\n".
+			$formattedDiff .= sprintf("@@ -%d,%d +%d,%d @@\n", $oldStart, $oldCount, $newStart, $newCount);
+
+			/*
+			 * NOTE(review): the space after the marker becomes part of every
+			 * emitted line, which Git's own diffs do not have. Confirm that
+			 * this is intended before feeding the output to `git apply`.
+			 */
+			foreach ($blockChanges as $change) {
+				$formattedDiff .= $change['type'] . ' ' . $change['line'] . "\n";
+			}
+		}
+
+		return $formattedDiff;
+	}
+
+	/**
+	 * Finds a longest common subsequence of two lists of lines.
+	 *
+	 * @param array $oldLines Lines of the old text.
+	 * @param array $newLines Lines of the new text.
+	 * @return array List of [ 'oldIndex' => ..., 'newIndex' => ... ] pairs that
+	 *               point at identical lines, ordered from first to last.
+	 */
+	private function calculateLCS($oldLines, $newLines) {
+		$oldLen = count($oldLines);
+		$newLen = count($newLines);
+		// $lcsMatrix[$i][$j] is the LCS length of the first $i old and first $j new lines.
+		$lcsMatrix = array_fill(0, $oldLen + 1, array_fill(0, $newLen + 1, 0));
+
+		// Build the LCS matrix
+		for ($i = 1; $i <= $oldLen; $i++) {
+			for ($j = 1; $j <= $newLen; $j++) {
+				if ($oldLines[$i - 1] === $newLines[$j - 1]) {
+					$lcsMatrix[$i][$j] = $lcsMatrix[$i - 1][$j - 1] + 1;
+				} else {
+					$lcsMatrix[$i][$j] = max($lcsMatrix[$i - 1][$j], $lcsMatrix[$i][$j - 1]);
+				}
+			}
+		}
+
+		// Backtrack to find the LCS
+		$lcs = [];
+		$i = $oldLen;
+		$j = $newLen;
+		while ($i > 0 && $j > 0) {
+			if ($oldLines[$i - 1] === $newLines[$j - 1]) {
+				$lcs[] = ['oldIndex' => $i - 1, 'newIndex' => $j - 1];
+				$i--;
+				$j--;
+			} elseif ($lcsMatrix[$i - 1][$j] >= $lcsMatrix[$i][$j - 1]) {
+				$i--;
+			} else {
+				$j--;
+			}
+		}
+
+		return array_reverse($lcs);
+	}
+
+}
+
diff --git a/packages/playground/data-liberation/src/git/WP_Git_Repository.php b/packages/playground/data-liberation/src/git/WP_Git_Repository.php
index d9e1150f6d..9a45ad5ee8 100644
--- a/packages/playground/data-liberation/src/git/WP_Git_Repository.php
+++ b/packages/playground/data-liberation/src/git/WP_Git_Repository.php
@@ -117,13 +117,20 @@ class WP_Git_Repository {
 	private $buffered_object_content;
 	private $last_error;
 
+	/**
+	 * @var WP_Git_Diff_Engine
+	 */
+	private $diff_engine;
+
 	private const DELETE_PLACEHOLDER = 'DELETE_PLACEHOLDER';
 	private const NULL_OID = '0000000000000000000000000000000000000000';
 
 	public function __construct(
-		WP_Abstract_Filesystem $fs
+		WP_Abstract_Filesystem $fs,
+		$options = []
 	) {
 		$this->fs = $fs;
+		$this->diff_engine = $options['diff_engine'] ??
new WP_Git_Diff_Engine();
 		$this->initialize_filesystem();
 	}
 
@@ -687,6 +694,171 @@ public function commit($options=[]) {
 		return $commit_oid;
 	}
 
+	/**
+	 * Computes the differences between the root trees of two commits.
+	 *
+	 * @param string $current_oid  Object ID of the commit holding the newer state.
+	 * @param string $previous_oid Object ID of the commit holding the older state.
+	 * @return array|false The result of diff_trees() for both commits' trees,
+	 *                     or false when either commit cannot be read.
+	 */
+	public function diff_commits($current_oid, $previous_oid) {
+		if(false === $this->read_object($current_oid)) {
+			return false;
+		}
+		$current_commit = $this->get_parsed_commit();
+
+		if(false === $this->read_object($previous_oid)) {
+			return false;
+		}
+		$previous_commit = $this->get_parsed_commit();
+
+		return $this->diff_trees($current_commit['tree'], $previous_commit['tree']);
+	}
+
+	/**
+	 * Recursively compares two tree objects.
+	 *
+	 * The result is keyed by entry name:
+	 *
+	 * - Entries that only exist in the current tree, or whose mode changed,
+	 *   are returned as they appear in the current tree.
+	 * - Entries with an unchanged mode and a different sha1 are returned as
+	 *   an array with 'mode' set to 'diff' and the nested tree diff (for
+	 *   directories) or the diff_blobs() result (otherwise) under 'diff'.
+	 * - Entries that only exist in the previous tree are mapped to
+	 *   self::DELETE_PLACEHOLDER.
+	 *
+	 * @param string $current_oid  Object ID of the newer tree.
+	 * @param string $previous_oid Object ID of the older tree.
+	 * @return array|false The diff, or false when any involved object cannot be read.
+	 */
+	public function diff_trees($current_oid, $previous_oid) {
+		if(false === $this->read_object($current_oid)) {
+			return false;
+		}
+		$current_tree = $this->get_parsed_tree();
+
+		if(false === $this->read_object($previous_oid)) {
+			return false;
+		}
+		$previous_tree = $this->get_parsed_tree();
+
+		$diff = [];
+		foreach($current_tree as $name => $current_entry) {
+			if(!isset($previous_tree[$name])) {
+				$diff[$name] = $current_entry;
+				continue;
+			}
+			$previous_entry = $previous_tree[$name];
+
+			if($current_entry['mode'] !== $previous_entry['mode']) {
+				/*
+				 * Checked before the sha1 comparison so that a mode-only change is
+				 * still reported.
+				 *
+				 * @TODO: Account for a scenario when just one text file changes and
+				 *        also the mode changed from executable to non-executable.
+				 *        We could do a text diff in that case.
+				 */
+				$diff[$name] = $current_entry;
+				continue;
+			}
+
+			if($current_entry['sha1'] === $previous_entry['sha1']) {
+				continue;
+			}
+
+			if($current_entry['mode'] === WP_Git_Pack_Processor::FILE_MODE_DIRECTORY) {
+				$entry_diff = $this->diff_trees($current_entry['sha1'], $previous_entry['sha1']);
+			} else {
+				$entry_diff = $this->diff_blobs($current_entry, $previous_entry);
+			}
+			if(false === $entry_diff) {
+				// An object below this entry could not be read; report the failure.
+				return false;
+			}
+
+			$diff[$name] = [
+				'name' => $name,
+				'mode' => 'diff',
+				'sha1' => $current_entry['sha1'],
+				'diff' => $entry_diff,
+			];
+		}
+
+		foreach($previous_tree as $name => $previous_entry) {
+			if(!isset($current_tree[$name])) {
+				$diff[$name] = self::DELETE_PLACEHOLDER;
+			}
+		}
+		return $diff;
+	}
+
+	/**
+	 * Compares the contents of two blobs.
+	 *
+	 * @param array $current_blob_entry  Tree entry of the newer blob ('sha1' and 'name' are read).
+	 * @param array $previous_blob_entry Tree entry of the older blob ('sha1' and 'name' are read).
+	 * @return array|false [ 'type' => 'binary' ] when both blobs look binary,
+	 *                     [ 'type' => 'completely_new_blob' ] when only one does,
+	 *                     [ 'type' => 'text_diff', 'diff' => ... ] otherwise,
+	 *                     or false when a blob cannot be read.
+	 */
+	public function diff_blobs($current_blob_entry, $previous_blob_entry) {
+		if(false === $this->read_object($current_blob_entry['sha1'])) {
+			return false;
+		}
+		// @TODO: Support streaming diffs for large files
+		$current_blob = $this->read_entire_object_contents();
+		$current_blob_is_binary = self::guess_if_binary_blob($current_blob_entry, $current_blob);
+
+		if(false === $this->read_object($previous_blob_entry['sha1'])) {
+			return false;
+		}
+		$previous_blob = $this->read_entire_object_contents();
+		$previous_blob_is_binary = self::guess_if_binary_blob($previous_blob_entry, $previous_blob);
+
+		if($current_blob_is_binary && $previous_blob_is_binary) {
+			return ['type' => 'binary'];
+		}
+		if($current_blob_is_binary !== $previous_blob_is_binary) {
+			return ['type' => 'completely_new_blob'];
+		}
+
+		/*
+		 * The previous contents are passed first so that lines only present in
+		 * the current blob are reported as additions.
+		 *
+		 * NOTE(review): assumes the engine's diff() takes ($old, $new), as the
+		 * $oldLines/$newLines handling in WP_Git_Diff_Engine suggests. Confirm.
+		 */
+		return [
+			'type' => 'text_diff',
+			'diff' => $this->diff_engine->diff($previous_blob, $current_blob),
+		];
+	}
+
+	/**
+	 * Guesses whether a blob holds binary data.
+	 *
+	 * @param array  $blob_entry    Tree entry of the blob; only 'name' is read.
+	 * @param string $blob_contents Blob contents.
+	 * @return bool True when the name has a known image extension or the
+	 *              contents include a null byte, false otherwise.
+	 */
+	private static function guess_if_binary_blob($blob_entry, $blob_contents) {
+		$binary_extensions = ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'tif', 'raw', 'heic', 'heif', 'avif'];
+		// Compare case-insensitively so that names such as "LOGO.PNG" are recognized.
+		$extension = strtolower(pathinfo($blob_entry['name'] ?? '', PATHINFO_EXTENSION));
+		if(in_array($extension, $binary_extensions, true)) {
+			return true;
+		}
+
+		// Naively assume null bytes only occur in binary files
+		return strpos($blob_contents, "\0") !== false;
+	}
+ public function squash($squash_into_commit_oid, $squash_until_ancestor_oid) { // Find the parent of the squashed range $this->read_object($squash_until_ancestor_oid);