mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Math
synced 2024-12-21 03:42:46 +00:00
c27bc62ee7
* for evaluation * formats to bracket format as used in RTED java application Change-Id: Ia85b7fdac55a9acfbc48e266a189e733634752f9
333 lines
9 KiB
PHP
333 lines
9 KiB
PHP
<?php
|
|
|
|
namespace MediaWiki\Extension\Math\WikiTexVC\MMLmappings\Util;
|
|
|
|
use MediaWiki\Extension\Math\WikiTexVC\XMLNode;
|
|
|
|
/**
 * Algorithm to make a simple, but customizable comparison of two MathML-Strings for automated testing.
 * It compares all element keys in the tree without order and calculates an F-score based on similarity.
 * Also, it gives information about the elements which differ between both trees.
 *
 * @author Johannes Stegmüller
 */
class MMLComparator {

	/**
	 * Element keys listed here are not considered for the similarity
	 * comparison (unless CHECKEXPLICITLY re-enables them).
	 */
	private const IGNOREDELEMENTKEYS = [
		"semantics",
		"annotation",
		"annotation-xml",
		"csymbol",
		"mrow",
		"mstyle"
	];

	/**
	 * Exceptions to the ignore list: if at least one element of the given key
	 * carries one of the listed attributes, the elements of that key are
	 * compared although the key is on the ignore list.
	 */
	private const CHECKEXPLICITLY = [
		"mstyle" => [ "mathcolor" ]
	];

	/**
	 * Attributes which are not considered for similarity checks.
	 * Attributes are always defined for a specific mathml key.
	 */
	private const IGNOREDATTRIBUTES = [
		"mrow" => [ "class", "data-mjx-texclass" ],
		"math" => [ "alttext", "display" ],
		"mi" => [ "class", "data-mjx-variant", "mathvariant", "data-mjx-texclass", "data-mjx-alternate" ],
		"mo" => [ "data-mjx-pseudoscript", "stretchy", "fence", "data-mjx-texclass", "texClass", "class",
			"data-mjx-alternate", "form", "accent", "lspace", "rspace", "xref", "id" ],
		"mtext" => [ "texClass", "class" ],
		"mspace" => [ "width" ]
	];

	/**
	 * Parses a MathML string and renders its element tree in bracket notation.
	 *
	 * @param string $mml MathML (XML) input
	 * @return string the tree as produced by convertToBracketFormat()
	 * @throws \InvalidArgumentException if the input cannot be parsed as XML
	 */
	public static function functionObtainTreeInBrackets( $mml ) {
		$xml = self::parseMathML( $mml );
		if ( $xml === null ) {
			// Previously this ended in a fatal "member function on bool" error.
			throw new \InvalidArgumentException( 'The given MathML could not be parsed as XML.' );
		}
		return self::convertToBracketFormat( self::xmlToNode( $xml ) );
	}

	/**
	 * Compares the base to the comparison MathML.
	 * The base is considered the reference MathML. Calculates an F-Score for similarity of elements.
	 * MathML elements which are not considered for similarity checks can be specified within this class.
	 * Gives back the differing elements relative to the base.
	 * If there is any issue with the input (empty or not parsable as XML),
	 * the returned F-Score has the value '-0.0' and no differences are reported.
	 *
	 * @param string $mmlBase MathML base element as string
	 * @param string $mmlComp MathML comparison element as string
	 * @return array with 'similarityF': Calculated F-Score for similarity, 'more/less': diffs relative to the base.
	 */
	public function compareMathML( $mmlBase, $mmlComp ): array {
		$compRes = [
			"similarityF" => -0.0,
			"more" => [],
			"less" => []
		];

		if ( !$mmlBase || !$mmlComp || trim( $mmlBase ) == "" || trim( $mmlComp ) == "" ) {
			return $compRes;
		}

		$xmlBase = self::parseMathML( $mmlBase );
		$xmlComp = self::parseMathML( $mmlComp );
		if ( $xmlBase === null || $xmlComp === null ) {
			// Unparsable input must not be scored: two broken documents would
			// otherwise flatten to two empty arrays and compare as identical.
			return $compRes;
		}

		// Create flattened Arrays of MathML
		$mbase = $this->createComparisonArray( $xmlBase );
		$mcomp = $this->createComparisonArray( $xmlComp );

		// Number of considered elements in base ("relevant") and comparison ("retrieved")
		$overallRelevantSum = $this->countArraySize( $mbase )[0];
		$overallRetrievedSum = $this->countArraySize( $mcomp )[0];

		$ctrSame = $this->compareMathMLKeyArrays( $compRes, $mbase, $mcomp );

		$compRes['similarityF'] = $this->calculateFscore( $ctrSame, $overallRelevantSum, $overallRetrievedSum );
		return $compRes;
	}

	/**
	 * Parses an XML string without emitting PHP warnings for malformed input.
	 *
	 * @param string $mml XML input
	 * @return \SimpleXMLElement|null the parsed document, null if parsing failed
	 */
	private static function parseMathML( $mml ): ?\SimpleXMLElement {
		$previous = libxml_use_internal_errors( true );
		try {
			$xml = simplexml_load_string( $mml );
		} finally {
			if ( !$previous ) {
				// Only discard the error buffer if it was enabled by this method.
				libxml_clear_errors();
			}
			libxml_use_internal_errors( $previous );
		}
		return $xml instanceof \SimpleXMLElement ? $xml : null;
	}

	/**
	 * Recursively converts a SimpleXML element into an XMLNode tree.
	 * Only element names are transferred; attributes and text are dropped.
	 *
	 * @param \SimpleXMLElement $xml element to convert
	 * @return XMLNode node named after the element, with one child node per child element
	 */
	public static function xmlToNode( $xml ) {
		$node = new XMLNode( $xml->getName() );

		foreach ( $xml->children() as $child ) {
			$node->children[] = self::xmlToNode( $child );
		}

		return $node;
	}

	/**
	 * Computes an edit distance between the sequences of direct children of two nodes.
	 * Only the 'value' of the direct children is compared (classic insert/remove/replace
	 * dynamic programming); deeper descendants are not inspected.
	 * This method is not called from within this class.
	 *
	 * @param XMLNode $root1 first node
	 * @param XMLNode $root2 second node
	 * @return array 'TED': the distance, 'normalizedTED': distance divided by the number of
	 *  direct children of both nodes, or -1 if neither node has children
	 */
	private function treeEditDistance( $root1, $root2 ) {
		$n = count( $root1->children );
		$m = count( $root2->children );

		// Every cell only depends on cells filled earlier in this row-major pass.
		$dp = [];
		for ( $i = 0; $i <= $n; $i++ ) {
			$dp[$i] = [];
			for ( $j = 0; $j <= $m; $j++ ) {
				if ( $i == 0 ) {
					$dp[$i][$j] = $j;
				} elseif ( $j == 0 ) {
					$dp[$i][$j] = $i;
				} elseif ( $root1->children[$i - 1]->value == $root2->children[$j - 1]->value ) {
					$dp[$i][$j] = $dp[$i - 1][$j - 1];
				} else {
					$dp[$i][$j] = 1 + min(
						$dp[$i][$j - 1], // Insert
						$dp[$i - 1][$j], // Remove
						$dp[$i - 1][$j - 1] // Replace
					);
				}
			}
		}

		$ted = $dp[$n][$m];
		// Number of direct children of both nodes
		$totalNodes = $n + $m;
		$normalizedTED = $totalNodes != 0 ? $ted / $totalNodes : -1;

		return [ "TED" => $ted, "normalizedTED" => $normalizedTED ];
	}

	/**
	 * Serializes a node tree into a bracket string.
	 * A node without children becomes "{value}", a node with children becomes
	 * "value{" followed by its serialized children and "}".
	 * Example: math with the single child mi yields "math{{mi}}".
	 *
	 * NOTE(review): inner nodes carry their label in front of the brace while leaves
	 * carry it inside; confirm this is the form the consuming tool expects.
	 *
	 * @param XMLNode $root node exposing 'value' and 'children'
	 * @return string bracket representation of the tree
	 */
	public static function convertToBracketFormat( $root ) {
		if ( empty( $root->children ) ) {
			return "{" . $root->value . "}";
		}

		$result = $root->value . '{';
		foreach ( $root->children as $child ) {
			$result .= self::convertToBracketFormat( $child );
		}

		return $result . '}';
	}

	/**
	 * Compares the flattened element arrays key by key and records the differences.
	 * Only keys which exist in the base AND in the comparison are inspected; elements
	 * of a key which is present on one side only are not added to 'more'/'less'
	 * (they still lower the F-score through the element counts).
	 *
	 * @param array &$compRes result array, 'more' and 'less' get filled per element key
	 * @param array $mbase flattened base elements
	 * @param array $mcomp flattened comparison elements
	 * @return int number of base elements which have a matching comparison element
	 */
	private function compareMathMLKeyArrays( &$compRes, $mbase, $mcomp ) {
		$intersections = 0;
		foreach ( $mbase as $key => $baseElement ) {
			if ( !isset( $mcomp[$key] ) ) {
				// The base has this mml element(s), but not the comparison
				continue;
			}
			$compRet = $this->compareArrays( $baseElement, $mcomp[$key] );
			$intersections += $compRet['sameCtr'];
			if ( count( $compRet['more'] ) > 0 ) {
				$compRes['more'][$key] = $compRet['more'];
			}
			if ( count( $compRet['less'] ) > 0 ) {
				$compRes['less'][$key] = $compRet['less'];
			}
		}
		return $intersections;
	}

	/**
	 * Calculates the F-score from the number of matching elements.
	 *
	 * @param int $intersection number of elements found in base and comparison
	 * @param int $sumRelevant number of elements in the base
	 * @param int $sumRetrieved number of elements in the comparison
	 * @return float|int 1.0 if both sides are empty, 0 if nothing matches, else the F-score
	 */
	private function calculateFscore( $intersection, $sumRelevant, $sumRetrieved ) {
		if ( $sumRelevant == 0 && $sumRetrieved == 0 ) {
			return 1.0;
		}

		if ( $sumRetrieved == 0 ) {
			// Base has elements, comparison has none
			return 0;
		}

		// With an empty base there is nothing that could be missed
		$recall = $sumRelevant == 0 ? 1 : $intersection / $sumRelevant;
		$prec = $intersection / $sumRetrieved;
		if ( ( $prec + $recall ) == 0 ) {
			return 0;
		}
		// Calculate F-Score
		return 2 * ( ( $prec * $recall ) / ( $prec + $recall ) );
	}

	/**
	 * Counts the elements and their attributes within a flattened element array.
	 *
	 * @param array $arr flattened elements, grouped by element key
	 * @return array [ number of elements, number of attribute entries of all elements ]
	 */
	private function countArraySize( $arr ): array {
		$overallSize = 0;
		$overallAttrs = 0;
		foreach ( $arr as $element ) {
			foreach ( $element as $attrs ) {
				$overallSize += 1;
				$overallAttrs += count( $attrs );
			}
		}

		return [ $overallSize, $overallAttrs ];
	}

	/**
	 * Matches the elements of one key from base and comparison against each other.
	 * Every comparison element can be matched by one base element only.
	 *
	 * @param array $base elements of the base
	 * @param array $comp elements of the comparison
	 * @return array 'sameCtr': number of matched pairs, 'less': base elements without a match,
	 *  'more': comparison elements without a match (both in their original order)
	 */
	private function compareArrays( $base, $comp ): array {
		$sameCtr = 0;
		$less = [];

		foreach ( $base as $baseEl ) {
			$matchedKey = null;
			foreach ( $comp as $keyComp => $compEl ) {
				if ( $this->compareTwoBaseElements( $compEl, $baseEl ) ) {
					$matchedKey = $keyComp;
					break;
				}
			}
			if ( $matchedKey === null ) {
				$less[] = $baseEl;
				continue;
			}
			// Consume the partner, so it can not be matched a second time
			unset( $comp[$matchedKey] );
			$sameCtr += 1;
		}

		return [
			"sameCtr" => $sameCtr,
			"less" => $less,
			"more" => array_values( $comp ),
		];
	}

	/**
	 * Checks if two flattened elements have the same attributes and value.
	 * The order of the attributes is irrelevant, but attribute names and values
	 * both have to match. Values are compared by their string representation.
	 *
	 * @param mixed $compEl element of the comparison
	 * @param mixed $baseEl element of the base
	 * @return bool true if both elements are considered the same
	 */
	private function compareTwoBaseElements( $compEl, $baseEl ): bool {
		// Most basic comparison, this works for elements without attributes
		if ( $compEl === $baseEl ) {
			return true;
		}
		if ( !is_array( $compEl ) || !is_array( $baseEl ) ) {
			return false;
		}
		// A value-only check (array_diff) would e.g. treat [ "mathcolor" => "red" ]
		// and [ "val" => "red" ] as the same, so the keys are compared as well.
		return $this->normalizeElement( $compEl ) === $this->normalizeElement( $baseEl );
	}

	/**
	 * Brings a flattened element into a canonical form for comparison.
	 *
	 * @param array $element attribute name (or "val") => value
	 * @return array the same entries with values cast to string, sorted by key
	 */
	private function normalizeElement( array $element ): array {
		$normalized = [];
		foreach ( $element as $name => $value ) {
			// Attribute values are SimpleXMLElement objects, "val" is a plain string
			$normalized[$name] = (string)$value;
		}
		ksort( $normalized );
		return $normalized;
	}

	/**
	 * Flattens a parsed MathML document and removes the ignored element keys.
	 *
	 * @param \SimpleXMLElement $xml parsed MathML
	 * @return array element key => list of elements (attribute arrays)
	 */
	private function createComparisonArray( $xml ): array {
		$allKeys = [];
		$this->prepareMMLElements( $xml, $allKeys );
		return $this->filterKeys( $allKeys );
	}

	/**
	 * Removes all element keys on the ignore list, except for those which are
	 * re-enabled by checkExplicitKeys().
	 *
	 * @param array $allMMLElements element key => list of elements
	 * @return array the element keys which are considered for the comparison
	 */
	private function filterKeys( $allMMLElements ): array {
		$finalKeys = [];
		foreach ( $allMMLElements as $key => $element ) {
			if ( in_array( $key, self::IGNOREDELEMENTKEYS, true ) ) {
				$finalKeys = $this->checkExplicitKeys( $key, $element, $finalKeys );
				continue;
			}
			$finalKeys[$key] = $element;
		}
		return $finalKeys;
	}

	/**
	 * Recursively collects all descendants of an element, grouped by element name.
	 * The passed element itself is not added, only the elements below it.
	 *
	 * @param \SimpleXMLElement $xml element whose descendants get collected
	 * @param array &$finalArray element key => list of elements, filled by this method
	 * @param bool $alwaysSetVal also store empty text as "val"; only applies to the
	 *  direct children, the flag is not handed on to deeper levels
	 */
	private function prepareMMLElements( $xml, &$finalArray, $alwaysSetVal = false ) {
		foreach ( $xml as $key => $element ) {
			if ( $element instanceof \SimpleXMLElement ) {
				$el = $this->filterAttributes( $key, $element->attributes() );
				$value = trim( (string)$element );
				if ( $alwaysSetVal || $value !== "" ) {
					$el["val"] = $value;
				}
				$finalArray[$key][] = $el;
			} else {
				$finalArray[$key][] = $element;
			}
			$this->prepareMMLElements( $element, $finalArray );
		}
	}

	/**
	 * Collects the attributes of an element, without the ones ignored for its key.
	 *
	 * @param string $elementKey name of the element the attributes belong to
	 * @param \SimpleXMLElement $attributes attributes of the element
	 * @return array attribute name => attribute value (SimpleXMLElement)
	 */
	private function filterAttributes( $elementKey, $attributes ): array {
		$ignoredAttrs = self::IGNOREDATTRIBUTES[$elementKey] ?? [];
		$finalAttributes = [];
		foreach ( $attributes as $akey => $aval ) {
			if ( in_array( $akey, $ignoredAttrs, true ) ) {
				continue;
			}
			$finalAttributes[$akey] = $aval[0];
		}
		return $finalAttributes;
	}

	/**
	 * Re-adds the elements of an ignored key, if at least one of them has an
	 * attribute listed in CHECKEXPLICITLY for this key. In that case all elements
	 * of the key are added, also the ones without such an attribute.
	 *
	 * @param string $key ignored element key
	 * @param array $element list of elements for this key
	 * @param array $finalKeys the element keys collected so far
	 * @return array $finalKeys, extended by $key if the condition is met
	 */
	private function checkExplicitKeys( $key, $element, array $finalKeys ): array {
		$check = self::CHECKEXPLICITLY[$key] ?? null;
		if ( !$check ) {
			return $finalKeys;
		}
		foreach ( $element as $added ) {
			foreach ( $check as $checkEl ) {
				if ( array_key_exists( $checkEl, $added ) ) {
					// One hit is sufficient, the assignment is the same for every hit
					$finalKeys[$key] = $element;
					return $finalKeys;
				}
			}
		}
		return $finalKeys;
	}
}
|