From a1cddd7e0de0b9c04a1c932d11b40da3677eb481 Mon Sep 17 00:00:00 2001
From: Stegmujo
Date: Tue, 31 Jan 2023 18:05:13 +0100
Subject: [PATCH] Add MathML-Comparator algorithm for the automated tests

* Compares two MathML trees
* Calculates F-Score and Diffs

Bug: T885309
Change-Id: Iabdb5b12054e4c1ce6e2c756643bafd371da6fb0
---
 tests/phpunit/unit/TexVC/MMLComparator.php | 253 +++++++++++++++++++++
 1 file changed, 253 insertions(+)
 create mode 100644 tests/phpunit/unit/TexVC/MMLComparator.php

diff --git a/tests/phpunit/unit/TexVC/MMLComparator.php b/tests/phpunit/unit/TexVC/MMLComparator.php
new file mode 100644
index 000000000..f6f8f686e
--- /dev/null
+++ b/tests/phpunit/unit/TexVC/MMLComparator.php
@@ -0,0 +1,253 @@
+		[ "mathcolor" ]
+	];
+
+	/**
+	 * Attributes which are not considered for similarity checks.
+	 * Attributes are always defined for a specific MathML element name.
+	 */
+	private const IGNOREDATTRIBUTES = [
+		"mrow" => [ "class", "data-mjx-texclass" ],
+		"math" => [ "alttext", "display" ],
+		"mi" => [ "class", "data-mjx-variant", "data-mjx-texclass", ],
+		"mo" => [ "data-mjx-pseudoscript", "stretchy", "fence", "data-mjx-texclass", "texClass" ]
+	];
+
+	/**
+	 * Compares the base MathML to the comparison MathML.
+	 *
+	 * The base is considered the reference MathML. Both inputs are parsed and flattened
+	 * into lists of attribute maps grouped by element name; the lists are then matched
+	 * against each other. The F-score is calculated from the number of matched elements,
+	 * the number of elements in the base and the number of elements in the comparison.
+	 * Elements and attributes which are not considered for similarity checks are
+	 * specified in the constants of this class.
+	 *
+	 * If an input is empty or is not well-formed XML, the returned F-score has the
+	 * value '-0.0' and both diff lists are empty.
+	 *
+	 * @param string $mmlBase MathML base element as string
+	 * @param string $mmlComp MathML comparison element as string
+	 * @return array with 'similarityF': calculated F-score for similarity,
+	 *  'less': elements found in the base but not in the comparison,
+	 *  'more': elements found in the comparison but not in the base
+	 *  (both grouped by element name).
+	 */
+	public function compareMathML( $mmlBase, $mmlComp ): array {
+		$compRes = [
+			"similarityF" => -0.0,
+			"more" => [],
+			"less" => []
+		];
+
+		if ( !$mmlBase || !$mmlComp || trim( $mmlBase ) === "" || trim( $mmlComp ) === "" ) {
+			return $compRes;
+		}
+
+		$xmlBase = $this->parseMathML( $mmlBase );
+		$xmlComp = $this->parseMathML( $mmlComp );
+		if ( $xmlBase === null || $xmlComp === null ) {
+			// Input which cannot be parsed must not be scored: two broken inputs would
+			// otherwise both flatten to empty lists and be rated as a perfect match.
+			return $compRes;
+		}
+
+		// Create flattened arrays of the MathML trees
+		$mbase = $this->createComparisonArray( $xmlBase );
+		$mcomp = $this->createComparisonArray( $xmlComp );
+
+		// Index 0 of the counts is the number of elements, which is what the score is based on
+		$overallRelevantSum = $this->countArraySize( $mbase )[0];
+		$overallRetrievedSum = $this->countArraySize( $mcomp )[0];
+
+		$ctrSame = $this->compareMathMLKeyArrays( $compRes, $mbase, $mcomp );
+
+		$compRes['similarityF'] = $this->calculateFscore( $ctrSame, $overallRelevantSum, $overallRetrievedSum );
+		return $compRes;
+	}
+
+	/**
+	 * Parses a MathML string without emitting PHP warnings for malformed input.
+	 *
+	 * @param string $mml MathML as string
+	 * @return \SimpleXMLElement|null the parsed root element, null if parsing failed
+	 */
+	private function parseMathML( $mml ): ?\SimpleXMLElement {
+		// Collect libxml errors internally for the duration of the call and restore the
+		// previous setting afterwards, so that other users of libxml are not affected.
+		$previousSetting = libxml_use_internal_errors( true );
+		$xml = simplexml_load_string( $mml );
+		libxml_clear_errors();
+		libxml_use_internal_errors( $previousSetting );
+
+		return $xml === false ? null : $xml;
+	}
+
+	/**
+	 * Matches the flattened base against the flattened comparison, one element name at a time.
+	 *
+	 * @param array &$compRes result array, its 'more' and 'less' entries are filled here
+	 * @param array $mbase flattened base, element name => list of attribute maps
+	 * @param array $mcomp flattened comparison, element name => list of attribute maps
+	 * @return int number of elements which were found in both inputs
+	 */
+	private function compareMathMLKeyArrays( &$compRes, $mbase, $mcomp ): int {
+		$intersections = 0;
+		foreach ( $mbase as $key => $baseElement ) {
+			if ( !array_key_exists( $key, $mcomp ) ) {
+				// The base has this MathML element, the comparison lacks it completely
+				$compRes['less'][$key] = $baseElement;
+				continue;
+			}
+			$compRet = $this->compareArrays( $baseElement, $mcomp[$key] );
+			$intersections += $compRet['sameCtr'];
+			if ( count( $compRet['more'] ) > 0 ) {
+				$compRes['more'][$key] = $compRet['more'];
+			}
+			if ( count( $compRet['less'] ) > 0 ) {
+				$compRes['less'][$key] = $compRet['less'];
+			}
+		}
+
+		// Element names which only the comparison has are additions relative to the base
+		foreach ( array_diff_key( $mcomp, $mbase ) as $key => $compElement ) {
+			$compRes['more'][$key] = $compElement;
+		}
+		return $intersections;
+	}
+
+	/**
+	 * Calculates the F-score from the number of matches and the sizes of both inputs.
+	 *
+	 * @param int $intersection number of elements found in both inputs
+	 * @param int $sumRelevant number of elements in the base
+	 * @param int $sumRetrieved number of elements in the comparison
+	 * @return float 1.0 if both inputs have no elements, 0.0 if the comparison has no
+	 *  elements or nothing matched, otherwise the harmonic mean of precision and recall
+	 */
+	private function calculateFscore( $intersection, $sumRelevant, $sumRetrieved ): float {
+		if ( $sumRelevant == 0 && $sumRetrieved == 0 ) {
+			return 1.0;
+		}
+
+		if ( $sumRetrieved == 0 ) {
+			return 0.0;
+		}
+
+		// An empty base cannot be divided by, its recall is defined as 1
+		$recall = $sumRelevant == 0 ? 1 : $intersection / $sumRelevant;
+		$prec = $intersection / $sumRetrieved;
+		if ( ( $prec + $recall ) == 0 ) {
+			return 0.0;
+		}
+		// Calculate F-Score
+		return 2 * ( ( $prec * $recall ) / ( $prec + $recall ) );
+	}
+
+	/**
+	 * Counts the elements and their attributes in a flattened MathML array.
+	 *
+	 * @param array $arr flattened MathML, element name => list of attribute maps
+	 * @return array index 0: number of elements, index 1: number of attribute-map entries
+	 */
+	private function countArraySize( $arr ): array {
+		$overallSize = 0;
+		$overallAttrs = 0;
+		foreach ( $arr as $element ) {
+			foreach ( $element as $attrs ) {
+				$overallSize += 1;
+				$overallAttrs += count( $attrs );
+			}
+		}
+
+		return [ $overallSize, $overallAttrs ];
+	}
+
+	/**
+	 * Matches the elements of one element name from the base against the comparison.
+	 *
+	 * Every element is matched at most once; whatever is left over afterwards is
+	 * reported as a difference.
+	 *
+	 * @param array $base list of attribute maps from the base
+	 * @param array $comp list of attribute maps from the comparison
+	 * @return array with 'sameCtr': number of matched pairs, 'less': unmatched elements
+	 *  of the base, 'more': unmatched elements of the comparison
+	 */
+	private function compareArrays( $base, $comp ): array {
+		$sameCtr = 0;
+
+		foreach ( $base as $keyBase => $baseEl ) {
+			foreach ( $comp as $keyComp => $compEl ) {
+				if ( $this->compareTwoBaseElements( $compEl, $baseEl ) ) {
+					// Remove the matched pair so that neither side is counted twice
+					unset( $base[$keyBase], $comp[$keyComp] );
+					$sameCtr += 1;
+					break;
+				}
+			}
+		}
+
+		return [
+			"sameCtr" => $sameCtr,
+			"less" => array_values( $base ),
+			"more" => array_values( $comp ),
+		];
+	}
+
+	/**
+	 * Checks if two flattened elements are the same.
+	 *
+	 * Two elements are the same if they have exactly the same attribute names with
+	 * exactly the same values; the order of the attributes is not relevant.
+	 *
+	 * @param array $compEl attribute map of the comparison element
+	 * @param array $baseEl attribute map of the base element
+	 * @return bool true if both elements are the same
+	 */
+	private function compareTwoBaseElements( $compEl, $baseEl ): bool {
+		if ( !is_array( $compEl ) || !is_array( $baseEl ) ) {
+			return $compEl === $baseEl;
+		}
+		// Bring both maps into the same order, then names and values have to be identical.
+		// Comparing the values alone would treat [ 'a' => 'x' ] and [ 'b' => 'x' ] as equal.
+		ksort( $compEl );
+		ksort( $baseEl );
+		return $compEl === $baseEl;
+	}
+
+	/**
+	 * Creates the flattened and filtered array for a parsed MathML tree.
+	 *
+	 * @param \SimpleXMLElement $xml parsed MathML
+	 * @return array element name => list of attribute maps
+	 */
+	private function createComparisonArray( $xml ): array {
+		$allKeys = [];
+		$this->prepareMMLElements( $xml, $allKeys );
+		return $this->filterKeys( $allKeys );
+	}
+
+	/**
+	 * Removes the ignored element names from a flattened MathML array.
+	 *
+	 * Ignored element names are only kept if checkExplicitKeys() decides so.
+	 *
+	 * @param array $allMMLElements element name => list of attribute maps
+	 * @return array the remaining entries of the input
+	 */
+	private function filterKeys( $allMMLElements ): array {
+		$finalKeys = [];
+		foreach ( $allMMLElements as $key => $element ) {
+			if ( in_array( $key, self::IGNOREDELEMENTKEYS, true ) ) {
+				$finalKeys = $this->checkExplicitKeys( $key, $element, $finalKeys );
+				continue;
+			}
+			$finalKeys[$key] = $element;
+		}
+		return $finalKeys;
+	}
+
+	/**
+	 * Recursively collects all descendants of a node, grouped by element name.
+	 *
+	 * Every collected element is stored as a map of its not-ignored attributes. The
+	 * trimmed string value of the element is added to that map as "val" if it is not
+	 * empty, or always if $alwaysSetVal is set. The node passed in is not collected itself.
+	 *
+	 * @param \SimpleXMLElement $xml node whose descendants are collected
+	 * @param array &$finalArray collected elements, element name => list of attribute maps
+	 * @param bool $alwaysSetVal store "val" also for elements with an empty string value
+	 */
+	private function prepareMMLElements( $xml, &$finalArray, $alwaysSetVal = false ) {
+		foreach ( $xml as $key => $element ) {
+			if ( !( $element instanceof \SimpleXMLElement ) ) {
+				// Nothing to filter and nothing to descend into
+				$finalArray[$key][] = $element;
+				continue;
+			}
+
+			$el = $this->filterAttributes( $key, $element->attributes() );
+			$value = trim( (string)$element );
+			if ( $alwaysSetVal || $value !== "" ) {
+				$el["val"] = $value;
+			}
+			$finalArray[$key][] = $el;
+
+			// Hand the flag down, so that it applies to the whole tree
+			$this->prepareMMLElements( $element, $finalArray, $alwaysSetVal );
+		}
+	}
+
+	/**
+	 * Creates the attribute map of an element without the ignored attributes.
+	 *
+	 * @param string $elementKey name of the MathML element the attributes belong to
+	 * @param \SimpleXMLElement|null $attributes attributes of the element
+	 * @return array attribute name => attribute value as string
+	 */
+	private function filterAttributes( $elementKey, $attributes ): array {
+		$finalAttributes = [];
+		if ( $attributes === null ) {
+			return $finalAttributes;
+		}
+
+		$ignoredAttrs = self::IGNOREDATTRIBUTES[$elementKey] ?? [];
+		foreach ( $attributes as $akey => $aval ) {
+			if ( in_array( $akey, $ignoredAttrs, true ) ) {
+				continue;
+			}
+			// Store the plain string: objects would never be identical between two documents
+			$finalAttributes[$akey] = (string)$aval;
+		}
+		return $finalAttributes;
+	}
+
+	/**
+	 * Keeps the elements of an ignored element name if one of them has an attribute
+	 * which is listed for that name in CHECKEXPLICITLY.
+	 *
+	 * @param string $key ignored MathML element name
+	 * @param array $element list of attribute maps collected for that name
+	 * @param array $finalKeys the filtered elements so far
+	 * @return array $finalKeys, with all elements of $key added if one of them matched
+	 */
+	private function checkExplicitKeys( $key, $element, array $finalKeys ): array {
+		$check = self::CHECKEXPLICITLY[$key] ?? [];
+		foreach ( $element as $added ) {
+			foreach ( $check as $checkEl ) {
+				if ( is_array( $added ) && array_key_exists( $checkEl, $added ) ) {
+					// One match is sufficient, the result cannot change anymore
+					$finalKeys[$key] = $element;
+					return $finalKeys;
+				}
+			}
+		}
+		return $finalKeys;
+	}
+}