Skip to content

Commit

Permalink
Rudimentary commit diffing and text diffing
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Jan 3, 2025
1 parent 1f27630 commit 2ad968b
Show file tree
Hide file tree
Showing 3 changed files with 280 additions and 1 deletion.
1 change: 1 addition & 0 deletions packages/playground/data-liberation/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@
require_once __DIR__ . '/src/git/WP_Git_Pack_Processor.php';
require_once __DIR__ . '/src/git/WP_Git_Repository.php';
require_once __DIR__ . '/src/git/WP_Git_Filesystem.php';
require_once __DIR__ . '/src/git/WP_Git_Diff_Engine.php';

require_once __DIR__ . '/src/WP_Data_Liberation_HTML_Processor.php';
require_once __DIR__ . '/src/utf8_decoder.php';
Expand Down
158 changes: 158 additions & 0 deletions packages/playground/data-liberation/src/git/WP_Git_Diff_Engine.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
<?php

/**
 * A minimal line-based text diff engine.
 *
 * Splits both inputs on "\n", aligns them using a longest common
 * subsequence (LCS) of lines, and can render the resulting change list
 * as a Git-style unified diff.
 */
class WP_Git_Diff_Engine {

	/**
	 * Computes a line-by-line diff between two strings.
	 *
	 * @param string $oldString The original text.
	 * @param string $newString The updated text.
	 * @return array[] Ordered list of changes. Each change is an array with:
	 *                 - 'type':     '-' (only in old), '+' (only in new) or ' ' (in both),
	 *                 - 'line':     the line contents without the "\n" separator,
	 *                 - 'oldIndex': 0-based line index in the old text, or null for '+',
	 *                 - 'newIndex': 0-based line index in the new text, or null for '-'.
	 */
	public function diff($oldString, $newString) {
		$oldLines = explode("\n", $oldString);
		$newLines = explode("\n", $newString);
		$oldTotal = count($oldLines);
		$newTotal = count($newLines);

		$oldIndex = 0;
		$newIndex = 0;
		$changes = [];

		foreach ($this->calculateLCS($oldLines, $newLines) as $match) {
			// Everything located before the next common line was either removed or added.
			while ($oldIndex < $match['oldIndex'] || $newIndex < $match['newIndex']) {
				if ($oldIndex < $match['oldIndex']) {
					$changes[] = ['type' => '-', 'line' => $oldLines[$oldIndex], 'oldIndex' => $oldIndex, 'newIndex' => null];
					$oldIndex++;
				}
				if ($newIndex < $match['newIndex']) {
					$changes[] = ['type' => '+', 'line' => $newLines[$newIndex], 'oldIndex' => null, 'newIndex' => $newIndex];
					$newIndex++;
				}
			}

			// The common line itself is recorded as context.
			if ($oldIndex < $oldTotal && $newIndex < $newTotal) {
				$changes[] = ['type' => ' ', 'line' => $oldLines[$oldIndex], 'oldIndex' => $oldIndex, 'newIndex' => $newIndex];
				$oldIndex++;
				$newIndex++;
			}
		}

		// Whatever follows the last common line is a removal or an addition.
		while ($oldIndex < $oldTotal) {
			$changes[] = ['type' => '-', 'line' => $oldLines[$oldIndex], 'oldIndex' => $oldIndex, 'newIndex' => null];
			$oldIndex++;
		}
		while ($newIndex < $newTotal) {
			$changes[] = ['type' => '+', 'line' => $newLines[$newIndex], 'oldIndex' => null, 'newIndex' => $newIndex];
			$newIndex++;
		}

		return $changes;
	}

	/**
	 * Renders a change list produced by diff() as a Git-style unified diff.
	 *
	 * Changed lines are grouped into hunks. Every hunk carries up to
	 * `contextLines` unchanged lines before and after its changes, and hunks
	 * whose context would overlap or touch are merged into a single hunk.
	 * Each hunk starts with an "@@ -start,count +start,count @@" header on its
	 * own line and every hunk line is prefixed with exactly one marker
	 * character ('+', '-' or ' ').
	 *
	 * @param array[] $changes Change list in the shape returned by diff().
	 * @param array   $options Optional settings:
	 *                         - 'contextLines' (int):    unchanged lines kept around changes. Default 3.
	 *                         - 'a_source'     (string): label of the old side. Default 'a/string'.
	 *                         - 'b_source'     (string): label of the new side. Default 'b/string'.
	 * @return string The formatted diff. Contains only the three header lines
	 *                when the change list has no '+' or '-' entries.
	 */
	public function formatAsGit($changes, $options = []) {
		$options['contextLines'] ??= 3;
		$options['a_source'] ??= 'a/string';
		$options['b_source'] ??= 'b/string';

		$formattedDiff = "diff --git " . $options['a_source'] . " " . $options['b_source'] . "\n";
		$formattedDiff .= "--- " . $options['a_source'] . "\n";
		$formattedDiff .= "+++ " . $options['b_source'] . "\n";

		// Hunk boundaries are computed positionally, so make sure the list is 0-indexed.
		$changes = array_values($changes);
		$contextLines = max(0, (int) $options['contextLines']);

		foreach ($this->groupIntoHunks($changes, $contextLines) as $hunk) {
			$formattedDiff .= $this->formatHunk($hunk);
		}

		return $formattedDiff;
	}

	/**
	 * Splits a change list into hunks of changed lines plus surrounding context.
	 *
	 * @param array[] $changes      0-indexed change list.
	 * @param int     $contextLines Number of entries to keep on each side of a change.
	 * @return array[][] List of hunks; each hunk is a contiguous slice of $changes.
	 */
	private function groupIntoHunks($changes, $contextLines) {
		$lastIndex = count($changes) - 1;

		// Inclusive [start, end] ranges, sorted and non-overlapping.
		$ranges = [];
		foreach ($changes as $index => $change) {
			if ($change['type'] === ' ') {
				continue;
			}

			$start = max(0, $index - $contextLines);
			$end = min($lastIndex, $index + $contextLines);

			$previous = count($ranges) - 1;
			if ($previous >= 0 && $start <= $ranges[$previous][1] + 1) {
				// Overlapping or directly adjacent to the previous hunk: extend it
				// so that no line is ever emitted twice.
				$ranges[$previous][1] = max($ranges[$previous][1], $end);
			} else {
				$ranges[] = [$start, $end];
			}
		}

		$hunks = [];
		foreach ($ranges as [$start, $end]) {
			$hunks[] = array_slice($changes, $start, $end - $start + 1);
		}

		return $hunks;
	}

	/**
	 * Renders a single hunk: the "@@" header line followed by the hunk lines.
	 *
	 * Line numbers in the header are 1-based. A side that contributes no
	 * lines to the hunk is reported with a start of 0.
	 *
	 * @param array[] $hunk Contiguous slice of a change list.
	 * @return string The formatted hunk, terminated by a newline.
	 */
	private function formatHunk($hunk) {
		$oldStart = null;
		$newStart = null;
		$oldCount = 0;
		$newCount = 0;
		$body = '';

		foreach ($hunk as $change) {
			// Context and removed lines exist on the old side.
			if ($change['type'] !== '+') {
				$oldStart ??= $change['oldIndex'];
				$oldCount++;
			}
			// Context and added lines exist on the new side.
			if ($change['type'] !== '-') {
				$newStart ??= $change['newIndex'];
				$newCount++;
			}
			$body .= $change['type'] . $change['line'] . "\n";
		}

		$oldStart = $oldStart !== null ? $oldStart + 1 : 0;
		$newStart = $newStart !== null ? $newStart + 1 : 0;

		return sprintf("@@ -%d,%d +%d,%d @@\n", $oldStart, $oldCount, $newStart, $newCount) . $body;
	}

	/**
	 * Finds a longest common subsequence of lines shared by two line lists.
	 *
	 * Builds the full (old + 1) x (new + 1) dynamic-programming table, so time
	 * and memory grow with the product of both line counts.
	 *
	 * @param string[] $oldLines Lines of the old text.
	 * @param string[] $newLines Lines of the new text.
	 * @return array[] Matches ordered from the start of the texts; each match is
	 *                 ['oldIndex' => int, 'newIndex' => int] (0-based).
	 */
	private function calculateLCS($oldLines, $newLines) {
		$oldLen = count($oldLines);
		$newLen = count($newLines);
		$lcsMatrix = array_fill(0, $oldLen + 1, array_fill(0, $newLen + 1, 0));

		// $lcsMatrix[$i][$j] holds the LCS length of the first $i old and first $j new lines.
		for ($i = 1; $i <= $oldLen; $i++) {
			for ($j = 1; $j <= $newLen; $j++) {
				if ($oldLines[$i - 1] === $newLines[$j - 1]) {
					$lcsMatrix[$i][$j] = $lcsMatrix[$i - 1][$j - 1] + 1;
				} else {
					$lcsMatrix[$i][$j] = max($lcsMatrix[$i - 1][$j], $lcsMatrix[$i][$j - 1]);
				}
			}
		}

		// Walk back from the bottom-right corner, collecting matches in reverse order.
		$lcs = [];
		$i = $oldLen;
		$j = $newLen;
		while ($i > 0 && $j > 0) {
			if ($oldLines[$i - 1] === $newLines[$j - 1]) {
				$lcs[] = ['oldIndex' => $i - 1, 'newIndex' => $j - 1];
				$i--;
				$j--;
			} elseif ($lcsMatrix[$i - 1][$j] >= $lcsMatrix[$i][$j - 1]) {
				$i--;
			} else {
				$j--;
			}
		}

		return array_reverse($lcs);
	}

}

122 changes: 121 additions & 1 deletion packages/playground/data-liberation/src/git/WP_Git_Repository.php
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,20 @@ class WP_Git_Repository {
private $buffered_object_content;
private $last_error;

/**
 * Engine used by diff_blobs() to compute text diffs between two blobs.
 *
 * @var WP_Git_Diff_Engine
 */
private $diff_engine;

// Marker stored by diff_trees() for entries present in the previous tree but missing from the current one.
private const DELETE_PLACEHOLDER = 'DELETE_PLACEHOLDER';
// Forty zero characters. NOTE(review): presumably the all-zero object id standing for "no object" — confirm against usages.
private const NULL_OID = '0000000000000000000000000000000000000000';

/**
 * Stores the filesystem and diff engine, then initializes the filesystem.
 *
 * NOTE(review): this snippet comes from a rendered diff. The `$fs` line
 * without a trailing comma appears to be the removed side of the change and
 * the one with the comma the added side — confirm that only the latter exists
 * in the actual file.
 *
 * @param WP_Abstract_Filesystem $fs      Filesystem assigned to $this->fs.
 * @param array                  $options Optional. The 'diff_engine' key replaces the
 *                                        default `new WP_Git_Diff_Engine()` instance.
 */
public function __construct(
WP_Abstract_Filesystem $fs
WP_Abstract_Filesystem $fs,
$options = []
) {
$this->fs = $fs;
$this->diff_engine = $options['diff_engine'] ?? new WP_Git_Diff_Engine();
$this->initialize_filesystem();
}

Expand Down Expand Up @@ -687,6 +694,119 @@ public function commit($options=[]) {
return $commit_oid;
}

/**
 * Diffs the trees referenced by two commits.
 *
 * @param string $current_oid  Object id of the commit treated as the current state.
 * @param string $previous_oid Object id of the commit treated as the previous state.
 * @return array|false Whatever diff_trees() returns for the two commit trees,
 *                     or false when either commit object cannot be read.
 */
public function diff_commits($current_oid, $previous_oid) {
	// Resolve both commits to their tree ids, current first, bailing out on the first read failure.
	$tree_oids = [];
	foreach ([$current_oid, $previous_oid] as $commit_oid) {
		if (false === $this->read_object($commit_oid)) {
			return false;
		}
		$parsed_commit = $this->get_parsed_commit();
		$tree_oids[] = $parsed_commit['tree'];
	}

	return $this->diff_trees($tree_oids[0], $tree_oids[1]);
}

/**
 * Computes the difference between two tree objects.
 *
 * The result is keyed by entry name:
 * - an entry that exists only in the current tree, or whose mode changed,
 *   maps to the current tree entry as-is,
 * - an entry whose sha1 changed while its mode stayed the same maps to
 *   ['name', 'mode' => 'diff', 'sha1', 'diff'], where 'diff' is a nested
 *   diff_trees() result for directories and a diff_blobs() result otherwise,
 * - an entry that exists only in the previous tree maps to
 *   self::DELETE_PLACEHOLDER.
 * Entries with identical sha1 values are omitted.
 *
 * @param string $current_oid  Object id of the current tree.
 * @param string $previous_oid Object id of the previous tree.
 * @return array|false The diff described above, or false when either tree cannot be read.
 */
public function diff_trees($current_oid, $previous_oid) {
	// Load both trees up front, current first; a failed read aborts the whole diff.
	$parsed_trees = [];
	foreach ([$current_oid, $previous_oid] as $tree_oid) {
		if (false === $this->read_object($tree_oid)) {
			return false;
		}
		$parsed_trees[] = $this->get_parsed_tree();
	}
	[$current_tree, $previous_tree] = $parsed_trees;

	$changes = [];
	foreach ($current_tree as $name => $entry) {
		$before = $previous_tree[$name] ?? null;

		// Brand new entry: report it verbatim.
		if (null === $before) {
			$changes[$name] = $entry;
			continue;
		}

		// Same content hash: nothing to report.
		if ($entry['sha1'] === $before['sha1']) {
			continue;
		}

		if ($entry['mode'] !== $before['mode']) {
			/*
			 * @TODO: Account for a scenario when just one text file changes and
			 *        also the mode changed from executable to non-executable.
			 *        We could do a text diff in that case.
			 */
			$changes[$name] = $entry;
			continue;
		}

		// Same mode, different content: descend into directories, diff blobs otherwise.
		$is_directory = $entry['mode'] === WP_Git_Pack_Processor::FILE_MODE_DIRECTORY;
		$changes[$name] = [
			'name' => $name,
			'mode' => 'diff',
			'sha1' => $entry['sha1'],
			'diff' => $is_directory
				? $this->diff_trees($entry['sha1'], $before['sha1'])
				: $this->diff_blobs($entry, $before),
		];
	}

	// Anything left only in the previous tree has been deleted.
	foreach (array_keys($previous_tree) as $name) {
		if (!isset($current_tree[$name])) {
			$changes[$name] = self::DELETE_PLACEHOLDER;
		}
	}

	return $changes;
}

/**
 * Compares two blob tree entries.
 *
 * The text diff describes how to get from the previous blob to the current
 * one, matching diff_trees(), which lists entries new in the current tree and
 * marks entries missing from it as deleted. Lines only present in the current
 * blob are therefore reported as '+' and lines only present in the previous
 * blob as '-'.
 *
 * @param array $current_blob_entry  Tree entry of the current blob; 'sha1' and 'name' are read.
 * @param array $previous_blob_entry Tree entry of the previous blob; 'sha1' and 'name' are read.
 * @return array|false One of:
 *                     - ['type' => 'binary'] when both blobs look binary,
 *                     - ['type' => 'completely_new_blob'] when exactly one of them looks binary,
 *                     - ['type' => 'text_diff', 'diff' => array] with the diff engine output,
 *                     or false when either blob cannot be read.
 */
public function diff_blobs($current_blob_entry, $previous_blob_entry) {
	if (false === $this->read_object($current_blob_entry['sha1'])) {
		return false;
	}
	// @TODO: Support streaming diffs for large files
	$current_blob = $this->read_entire_object_contents();
	$current_blob_is_binary = self::guess_if_binary_blob($current_blob_entry, $current_blob);

	if (false === $this->read_object($previous_blob_entry['sha1'])) {
		return false;
	}
	$previous_blob = $this->read_entire_object_contents();
	$previous_blob_is_binary = self::guess_if_binary_blob($previous_blob_entry, $previous_blob);

	if ($current_blob_is_binary && $previous_blob_is_binary) {
		return ['type' => 'binary'];
	}

	// Exactly one side is binary: a line diff would be meaningless.
	if ($current_blob_is_binary !== $previous_blob_is_binary) {
		return ['type' => 'completely_new_blob'];
	}

	return [
		'type' => 'text_diff',
		// The engine expects (old, new), so the previous blob goes first.
		'diff' => $this->diff_engine->diff($previous_blob, $current_blob),
	];
}

/**
 * Heuristically decides whether a blob should be treated as binary.
 *
 * A blob is considered binary when its file name carries one of the listed
 * image extensions (compared case-insensitively, so "LOGO.PNG" and
 * "logo.png" are treated alike) or when its contents contain a null byte.
 *
 * @param array  $blob_entry    Tree entry of the blob; only 'name' is read.
 * @param string $blob_contents Full contents of the blob.
 * @return bool True when the blob looks binary, false otherwise.
 */
static private function guess_if_binary_blob($blob_entry, $blob_contents) {
	$binary_extensions = ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'bmp', 'tiff', 'tif', 'raw', 'heic', 'heif', 'avif'];

	$extension = strtolower(pathinfo($blob_entry['name'], PATHINFO_EXTENSION));
	if (in_array($extension, $binary_extensions, true)) {
		return true;
	}

	// Naively assume null bytes only occur in binary files
	return strpos($blob_contents, "\0") !== false;
}

public function squash($squash_into_commit_oid, $squash_until_ancestor_oid) {
// Find the parent of the squashed range
$this->read_object($squash_until_ancestor_oid);
Expand Down

0 comments on commit 2ad968b

Please sign in to comment.