Add MathML-Comparator algorithm for the automated tests

* Compares two MathML trees
* Calculates F-Score and Diffs

Bug: T885309
Change-Id: Iabdb5b12054e4c1ce6e2c756643bafd371da6fb0
This commit is contained in:
Stegmujo 2023-01-31 18:05:13 +01:00
parent 095cec57c3
commit a1cddd7e0d

View file

@ -0,0 +1,253 @@
<?php
namespace MediaWiki\Extension\Math\TexVC\MMLmappings\Util;
/**
* Algorithm to make a simple, but customizable comparison of two MathML-Strings for automated testing.
* It compares all element keys in the tree without order and calculates an F-score based on similarity.
* Also, it gives information the elements which differ both trees.
*
* @author Johannes Stegmüller
*/
class MMLComparator {
/**
* These keys for mathml-elements get not
* considered for similarity comparison.
*/
private const IGNOREDELEMENTKEYS = [
"semantics",
"annotation",
"mrow",
"mstyle"
];
/**
* Although the keys mentioned here may be on the ignore list.
* Keys containing one of the specific attribute get checked.
*/
private const CHECKEXPLICITLY = [
"mstyle" => [ "mathcolor" ]
];
/**
* Attributes which are not considered for similarity checks.
* Attributes are always defined for a specific mathml key.
*/
private const IGNOREDATTRIBUTES = [
"mrow" => [ "class", "data-mjx-texclass" ],
"math" => [ "alttext", "display" ],
"mi" => [ "class", "data-mjx-variant", "data-mjx-texclass", ],
"mo" => [ "data-mjx-pseudoscript", "stretchy", "fence", "data-mjx-texclass", "texClass" ]
];
/**
* Compares the base to the comparison MathML.
* The base is considered the reference MathML. Calculates an F-Score for similarity of elements.
* MathML elements which are not considered for similarity checks can be specified within this class.
* Gives back the differing elements relative to the base.
* If any issue with the input, the returned F-Score has value '-0.0'
*
* @param string $mmlBase MathML base element as string
* @param string $mmlComp MathML comparison element as string
* @return array with 'similarityF': Calculated F-Score for similarity, 'more/less': diffs relative to the base.
*/
public function compareMathML( $mmlBase, $mmlComp ): array {
$compRes = [
"similarityF" => -0.0,
"more" => [],
"less" => []
];
if ( !$mmlBase || !$mmlComp || trim( $mmlBase ) == "" || trim( $mmlComp ) == "" ) {
return $compRes;
}
$xmlBase = simplexml_load_string( $mmlBase );
$xmlComp = simplexml_load_string( $mmlComp );
// Create flattened Arrays of MathML
$mbase = $this->createComparisonArray( $xmlBase );
$mcomp = $this->createComparisonArray( $xmlComp );
// Compare base and comp arrays, tbd: eventually extract this function
$countsBase = $this->countArraySize( $mbase );
$countsComp = $this->countArraySize( $mcomp );
$overallRelevantSum = $countsBase[0];
$overallRetrievedSum = $countsComp[0];
$ctrSame = $this->compareMathMLKeyArrays( $compRes, $mbase, $mcomp );
$compRes['similarityF'] = $this->calculateFscore( $ctrSame, $overallRelevantSum, $overallRetrievedSum );
return $compRes;
}
private function compareMathMLKeyArrays( &$compRes, $mbase, $mcomp ) {
$intersections = 0;
foreach ( $mbase as $key => $baseElement ) {
$compElement = $mcomp[$key] ?? null;
if ( $compElement == null ) {
// The base has this mml element(s), but not the comparison
continue;
}
$compRet = $this->compareArrays( $baseElement, $compElement );
$intersections += $compRet['sameCtr'];
if ( count( $compRet['more'] ) > 0 ) {
$compRes['more'][$key] = $compRet['more'];
}
if ( count( $compRet['less'] ) > 0 ) {
$compRes['less'][$key] = $compRet['less'];
}
}
return $intersections;
}
private function calculateFscore( $intersection, $sumRelevant, $sumRetrieved ) {
if ( $sumRelevant == 0 && $sumRetrieved == 0 ) {
return 1.0;
}
if ( $sumRetrieved == 0 && $sumRelevant != 0 ) {
return 0;
}
if ( $sumRelevant == 0 && $sumRetrieved != 0 ) {
$recall = 1;
} else {
$recall = $intersection / $sumRelevant;
}
$prec = $intersection / $sumRetrieved;
if ( ( $prec + $recall ) == 0 ) {
return 0;
}
// Calculate F-Score
return 2 * ( ( $prec * $recall ) / ( $prec + $recall ) );
}
private function countArraySize( $arr ): array {
$overallSize = 0;
$overallAttrs = 0;
foreach ( $arr as $element ) {
foreach ( $element as $attrs ) {
$overallSize += 1;
$overallAttrs += count( $attrs );
}
}
return [ $overallSize,$overallAttrs ];
}
private function compareArrays( $base, $comp ): array {
$sameCtr = 0;
$compRet = [
"sameCtr" => 0,
"less" => [],
"more" => [],
];
// Return score how many same or very similar items are in array
foreach ( $base as $keyBase => $baseEl ) {
foreach ( $comp as $keyComp => $compEl ) {
if ( $compEl == -1 ) {
continue;
}
if ( $this->compareTwoBaseElements( $compEl, $baseEl ) ) {
$base[$keyBase] = -1;
$comp[$keyComp] = -1;
$sameCtr += 1;
break;
}
}
}
$compRet['sameCtr'] = $sameCtr;
$compRet = $this->addRemainingElements( $compRet, 'less', $base );
return $this->addRemainingElements( $compRet, 'more', $comp );
}
private function addRemainingElements( $compRet, $key, $data ): array {
foreach ( $data as $el ) {
if ( $el !== null && $el != -1 ) {
$compRet[$key][] = $el;
}
}
return $compRet;
}
private function compareTwoBaseElements( $compEl, $baseEl ): bool {
// Most basic comparison, this works for elements without attributes
if ( $compEl === $baseEl ) {
return true;
}
// The array and sub-elements are exactly the same
if ( array_diff( $compEl, $baseEl ) == [] && array_diff( $baseEl, $compEl ) == [] ) {
return true;
}
return false;
}
private function createComparisonArray( $xml ): array {
$allKeys = [];
$this->prepareMMLElements( $xml, $allKeys );
$finalMMLKeys = $this->filterKeys( $allKeys );
return $finalMMLKeys;
}
private function filterKeys( $allMMLElements ): array {
$finalKeys = [];
foreach ( $allMMLElements as $key => $element ) {
if ( in_array( $key, self::IGNOREDELEMENTKEYS ) ) {
$finalKeys = $this->checkExplicitKeys( $key, $element, $finalKeys );
continue;
}
$finalKeys[$key] = $element;
}
return $finalKeys;
}
private function prepareMMLElements( $xml, &$finalArray, $alwaysSetVal = false ) {
foreach ( $xml as $key => $element ) {
if ( $element instanceof \SimpleXMLElement ) {
$el = $this->filterAttributes( $key, $element->attributes() );
$value = trim( (string)$element );
if ( $alwaysSetVal || $value !== "" ) {
$el["val"] = $value;
}
$finalArray[$key][] = $el ?? [];
} else {
$finalArray[$key][] = $element;
}
$this->prepareMMLElements( $element, $finalArray );
}
}
private function filterAttributes( $elementKey, $attributes ): array {
$ignoredAttrs = self::IGNOREDATTRIBUTES[$elementKey] ?? [];
$finalAttributes = [];
foreach ( $attributes as $akey => $aval ) {
if ( in_array( $akey, $ignoredAttrs ) ) {
continue;
}
$finalAttributes[$akey] = $aval[0];
}
return $finalAttributes;
}
private function checkExplicitKeys( $key, $element, array $finalKeys ): array {
$check = self::CHECKEXPLICITLY[$key] ?? null;
if ( $check ) {
foreach ( $element as $added ) {
foreach ( $check as $checkEl ) {
if ( array_key_exists( $checkEl, $added ) ) {
$finalKeys[$key] = $element;
}
}
}
}
return $finalKeys;
}
}