Path: blob/master/src/infrastructure/diff/prose/PhutilProseDifferenceEngine.php
12242 views
<?php12final class PhutilProseDifferenceEngine extends Phobject {34public function getDiff($u, $v) {5return $this->buildDiff($u, $v, 0);6}78private function buildDiff($u, $v, $level) {9$u_parts = $this->splitCorpus($u, $level);10$v_parts = $this->splitCorpus($v, $level);1112if ($level === 0) {13$diff = $this->newHashDiff($u_parts, $v_parts);14$too_large = false;15} else {16list($diff, $too_large) = $this->newEditDistanceMatrixDiff(17$u_parts,18$v_parts,19$level);20}2122$diff->reorderParts();2324// If we just built a character-level diff, we're all done and do not25// need to go any deeper.26if ($level == 3) {27return $diff;28}2930$blocks = array();31$block = null;32foreach ($diff->getParts() as $part) {33$type = $part['type'];34$text = $part['text'];35switch ($type) {36case '=':37if ($block) {38$blocks[] = $block;39$block = null;40}41$blocks[] = array(42'type' => $type,43'text' => $text,44);45break;46case '-':47if (!$block) {48$block = array(49'type' => '!',50'old' => '',51'new' => '',52);53}54$block['old'] .= $text;55break;56case '+':57if (!$block) {58$block = array(59'type' => '!',60'old' => '',61'new' => '',62);63}64$block['new'] .= $text;65break;66}67}6869if ($block) {70$blocks[] = $block;71}7273$result = new PhutilProseDiff();74foreach ($blocks as $block) {75$type = $block['type'];76if ($type == '=') {77$result->addPart('=', $block['text']);78} else {79$old = $block['old'];80$new = $block['new'];81if (!strlen($old) && !strlen($new)) {82// Nothing to do.83} else if (!strlen($old)) {84$result->addPart('+', $new);85} else if (!strlen($new)) {86$result->addPart('-', $old);87} else {88if ($too_large) {89// If this text was too big to diff, don't try to subdivide it.90$result->addPart('-', $old);91$result->addPart('+', $new);92} else {93$subdiff = $this->buildDiff(94$old,95$new,96$level + 1);9798foreach ($subdiff->getParts() as $part) {99$result->addPart($part['type'], $part['text']);100}101}102}103}104}105106$result->reorderParts();107108return $result;109}110111private function splitCorpus($corpus, $level) {112switch ($level) {113case 0:114// Level 0: Split into paragraphs.115$expr = '/([\n]+)/';116break;117case 1:118// Level 1: Split into sentences.119$expr = '/([\n,!;?\.]+)/';120break;121case 2:122// Level 2: Split into words.123$expr = '/(\s+)/';124break;125case 3:126// Level 3: Split into characters.127return phutil_utf8v_combined($corpus);128}129130$pieces = preg_split($expr, $corpus, -1, PREG_SPLIT_DELIM_CAPTURE);131return $this->stitchPieces($pieces, $level);132}133134private function stitchPieces(array $pieces, $level) {135$results = array();136$count = count($pieces);137for ($ii = 0; $ii < $count; $ii += 2) {138$result = $pieces[$ii];139if ($ii + 1 < $count) {140$result .= $pieces[$ii + 1];141}142143if ($level < 2) {144$trimmed_pieces = $this->trimApart($result);145foreach ($trimmed_pieces as $trimmed_piece) {146$results[] = $trimmed_piece;147}148} else {149$results[] = $result;150}151}152153// If the input ended with a delimiter, we can get an empty final piece.154// Just discard it.155if (last($results) == '') {156array_pop($results);157}158159return $results;160}161162private function newEditDistanceMatrixDiff(163array $u_parts,164array $v_parts,165$level) {166167$matrix = id(new PhutilEditDistanceMatrix())168->setMaximumLength(128)169->setSequences($u_parts, $v_parts)170->setComputeString(true);171172// For word-level and character-level changes, smooth the output string173// to reduce the choppiness of the diff.174if ($level > 1) {175$matrix->setApplySmoothing(PhutilEditDistanceMatrix::SMOOTHING_FULL);176}177178$u_pos = 0;179$v_pos = 0;180181$edits = $matrix->getEditString();182$edits_length = strlen($edits);183184$diff = new PhutilProseDiff();185for ($ii = 0; $ii < $edits_length; $ii++) {186$c = $edits[$ii];187if ($c == 's') {188$diff->addPart('=', $u_parts[$u_pos]);189$u_pos++;190$v_pos++;191} else if ($c == 'd') {192$diff->addPart('-', $u_parts[$u_pos]);193$u_pos++;194} else if ($c == 'i') {195$diff->addPart('+', $v_parts[$v_pos]);196$v_pos++;197} else if ($c == 'x') {198$diff->addPart('-', $u_parts[$u_pos]);199$diff->addPart('+', $v_parts[$v_pos]);200$u_pos++;201$v_pos++;202} else {203throw new Exception(204pht(205'Unexpected character ("%s") in edit string.',206$c));207}208}209210return array($diff, $matrix->didReachMaximumLength());211}212213private function newHashDiff(array $u_parts, array $v_parts) {214215$u_ref = new PhabricatorDocumentRef();216$v_ref = new PhabricatorDocumentRef();217218$u_blocks = $this->newDocumentEngineBlocks($u_parts);219$v_blocks = $this->newDocumentEngineBlocks($v_parts);220221$rows = id(new PhabricatorDocumentEngineBlocks())222->addBlockList($u_ref, $u_blocks)223->addBlockList($v_ref, $v_blocks)224->newTwoUpLayout();225226$diff = new PhutilProseDiff();227foreach ($rows as $row) {228list($u_block, $v_block) = $row;229230if ($u_block && $v_block) {231if ($u_block->getDifferenceType() === '-') {232$diff->addPart('-', $u_block->getContent());233$diff->addPart('+', $v_block->getContent());234} else {235$diff->addPart('=', $u_block->getContent());236}237} else if ($u_block) {238$diff->addPart('-', $u_block->getContent());239} else {240$diff->addPart('+', $v_block->getContent());241}242}243244return $diff;245}246247private function newDocumentEngineBlocks(array $parts) {248$blocks = array();249250foreach ($parts as $part) {251$hash = PhabricatorHash::digestForIndex($part);252253$blocks[] = id(new PhabricatorDocumentEngineBlock())254->setContent($part)255->setDifferenceHash($hash);256}257258return $blocks;259}260261public static function trimApart($input) {262// Split pieces into separate text and whitespace sections: make one263// piece out of all the whitespace at the beginning, one piece out of264// all the actual text in the middle, and one piece out of all the265// whitespace at the end.266267$parts = array();268269$length = strlen($input);270271$corpus = ltrim($input);272$l_length = strlen($corpus);273if ($l_length !== $length) {274$parts[] = substr($input, 0, $length - $l_length);275}276277$corpus = rtrim($corpus);278$lr_length = strlen($corpus);279280if ($lr_length) {281$parts[] = $corpus;282}283284if ($lr_length !== $l_length) {285// NOTE: This will be a negative value; we're slicing from the end of286// the input string.287$parts[] = substr($input, $lr_length - $l_length);288}289290return $parts;291}292293}294295296