Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/MatchRatingApproachTests.java
===================================================================
--- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/MatchRatingApproachTests.java (revision 0)
+++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/MatchRatingApproachTests.java (revision 0)
@@ -0,0 +1,913 @@
+package org.apache.lucene.analysis.phonetic;
+
+import static org.junit.Assert.*;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Series of tests for the Match Rating Approach algorithm
+ *
+ * General naming nomenclature for the tests is of the form:
+ * GeneralMetadataOnTheTestArea_ActualTestValues_ExpectedResult
+ *
+ * An unusual value is indicated by the term "corner case"
+ */
+public class MatchRatingApproachTests {
+
+
+ @Test
+ public void MRA_Test_AccentRemoval_AllLower_SuccessfullyRemoved()
+ {
+   //Arrange: lower-case vowels carrying acute accents
+   String input = "áéíóú";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "aeiou";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+ }
+
+
+ @Test
+ public void MRA_Test_AccentRemoval_WithSpaces_SuccessfullyRemovedAndSpacesInvariant()
+ {
+   //Arrange: accented vowels separated by spaces
+   String input = "áé íó ú";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "ae io u";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+
+ @Test
+ public void MRA_Test_AccentRemoval_UpperandLower_SuccessfullyRemovedAndCaseInvariant()
+ {
+   //Arrange: mixed-case input, some letters accented
+   String input = "ÁeíÓuu";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "AeiOuu";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+ @Test
+ public void MRA_Test_AccentRemoval_MixedWithUnusualChars_SuccessfullyRemovedAndUnusualcharactersInvariant()
+ {
+   //Arrange: accented letters interleaved with punctuation
+   String input = "Á-e'í.,ó&ú";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "A-e'i.,o&u";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+
+ @Test
+ public void MRA_Test_AccentRemoval_GerSpanFrenMix_SuccessfullyRemoved()
+ {
+   //Arrange: keyboard codes used to type the input characters
+   //alt plus 132 ä (0228)
+   //alt plus 137 ë (0235)
+   //alt plus 148 ö (0246)
+   //alt plus 225 ß (0223)
+   //alt plus 129 ü (0252)
+   //alt plus 142 Ä (0196)
+   //alt plus Ë (0203)
+   //alt plus 153 Ö (0214)
+   //alt plus 154 Ü (0220)
+   //alt plus 164 ñ
+   //alt plus 165 Ñ
+   //alt plus 0224 à
+   String input = "äëöüßÄËÖÜñÑà";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "aeoußAEOUnNa";
+
+   //Assert (the expected value keeps the sharp s unchanged)
+   assertEquals(expected, actual);
+
+ }
+
+
+ @Test
+ public void MRA_AccentRemovalNormalString_NoChange()
+ {
+   //Arrange: plain ASCII sentence without any accents
+   String input = "Colorless green ideas sleep furiously";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "Colorless green ideas sleep furiously";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+
+ @Test
+ public void MRA_AccentRemoval_NINO_NoChange()
+ {
+   //Arrange: empty input
+   String input = "";
+
+   //Act
+   String actual = MatchRatingApproach.RemoveAccents(input);
+   String expected = "";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+
+ //***** End Region - Test Accent Removal
+
+
+ //***** Begin Region - Test Get Encoding
+
+ @Test
+ public void MRA_GetEncoding_HARPER_HRPR() {
+
+   //Arrange
+   String input = "HARPER";
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "HRPR";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+
+ @Test
+ public void MRA_GetEncoding_SMITH_to_SMTH()
+ {
+   //Arrange: mixed-case input
+   String input = "Smith";
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "SMTH";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+ @Test
+ public void MRA_GetEncoding_SMYTH_to_SMYTH()
+ {
+   //Arrange: the expected code keeps the Y
+   String input = "Smyth";
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "SMYTH";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+ @Test
+ public void MRA_GetEncoding_Space_to_Nothing()
+ {
+   //Arrange: whitespace-only input
+   String input = " ";
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+ @Test
+ public void MRA_GetEncoding_NoSpace_to_Nothing()
+ {
+   //Arrange: empty input
+   String input = "";
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+ @Test
+ public void MRA_GetEncoding_Null_to_Nothing()
+ {
+   //Arrange: null input must be tolerated
+   String input = null;
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+ @Test
+ public void MRA_GetEncoding_One_Letter_to_Nothing()
+ {
+   //Arrange: a single letter is too short to encode
+   String input = "E";
+
+   //Act
+   String actual = MatchRatingApproach.GetMRA(input);
+   String expected = "";
+
+   //Assert (assertEquals reports both values when the check fails)
+   assertEquals(expected, actual);
+
+ }
+
+
+
+ @Test
+ public void MRA_Compare_SMITH_SMYTH_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Smith";
+   final String second = "smyth";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_BURNS_BOURNE_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Burns";
+   final String second = "Bourne";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_ShortNames_AL_ED_WorksButNoMatch()
+ {
+   // Arrange: two unrelated two-letter names
+   final String first = "Al";
+   final String second = "Ed";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_CATHERINE_KATHRYN_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Catherine";
+   final String second = "Kathryn";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_BRIAN_BRYAN_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Brian";
+   final String second = "Bryan";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_SEAN_SHAUN_SuccessfullyMatched()
+ {
+   // Arrange: the first name carries an accent
+   final String first = "Séan";
+   final String second = "Shaun";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_COLM_COLIN_WithAccentsAndSymbolsAndSpaces_SuccessfullyMatched()
+ {
+   // Arrange: accents, punctuation and a trailing space in the inputs
+   final String first = "Cólm. ";
+   final String second = "C-olín";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_STEPHEN_STEVEN_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Stephen";
+   final String second = "Steven";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_STEVEN_STEFAN_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Steven";
+   final String second = "Stefan";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+
+ @Test
+ public void MRA_Compare_STEPHEN_STEFAN_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Stephen";
+   final String second = "Stefan";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_SAM_SAMUEL_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Sam";
+   final String second = "Samuel";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_MICKY_MICHAEL_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Micky";
+   final String second = "Michael";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_OONA_OONAGH_SuccessfulloyMatched()
+ {
+   // Arrange
+   final String first = "Oona";
+   final String second = "Oonagh";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_SOPHIE_SOFIA_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Sophie";
+   final String second = "Sofia";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+
+ @Test
+ public void MRA_Compare_FRANCISZEK_FRANCES_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Franciszek";
+   final String second = "Frances";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_TOMASZ_TOM_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Tomasz";
+   final String second = "tom";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_SmallInput_CARK_Kl_SuccessfullyMatched()
+ {
+   // Arrange ("CARK" in the method name is presumably a typo for "KARL")
+   final String first = "Karl";
+   final String second = "Kl";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_CompareNameToSingleLetter_KARL_C_DoesNotMatch()
+ {
+   // Arrange: the single letter is passed as the first argument
+   final String first = "C";
+   final String second = "Karl";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_ZACH_ZAKARIA_SuccessfullyMatched()
+ {
+   // Arrange: the longer name is passed as the first argument
+   final String first = "Zakaria";
+   final String second = "Zach";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_KARL_ALESSANDRO_DoesNotMatch()
+ {
+   // Arrange
+   final String first = "Karl";
+   final String second = "Alessandro";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_Forenames_UNA_OONAGH_ShouldSuccessfullyMatchButDoesNot()
+ {
+   // Arrange
+   final String first = "Úna";
+   final String second = "Oonagh";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched); //To make test pass. Disappointing though.
+
+ }
+
+
+
+ //***** Begin Region - Test Compare - Surnames
+
+ @Test
+ public void MRA_Compare_Surname_OSULLIVAN_OSUILLEABHAIN_SuccessfulMatch()
+ {
+   // Arrange: second spelling has accents, spaces and an apostrophe
+   final String first = "O'Sullivan";
+   final String second = "Ó ' Súilleabháin";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_LongSurnames_MORIARTY_OMUIRCHEARTAIGH_DoesNotSuccessfulMatch()
+ {
+   // Arrange
+   final String first = "Moriarty";
+   final String second = "OMuircheartaigh";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_LongSurnames_OMUIRCHEARTAIGH_OMIREADHAIGH_SuccessfulMatch()
+ {
+   // Arrange: lower-case spelling versus accented spelling with spaces
+   final String first = "o'muireadhaigh";
+   final String second = "Ó 'Muircheartaigh ";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_Surname_COOPERFLYNN_SUPERLYN_SuccessfullyMatched()
+ {
+   // Arrange: hyphenated surnames
+   final String first = "Cooper-Flynn";
+   final String second = "Super-Lyn";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_Surname_HAILEY_HALLEY_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Hailey";
+   final String second = "Halley";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ // **** BEGIN YIDDISH/SLAVIC SECTION ****
+
+ @Test
+ public void MRA_Compare_Surname_AUERBACH_UHRBACH_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Auerbach";
+   final String second = "Uhrbach";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+ }
+
+
+ @Test
+ public void MRA_Compare_Surname_MOSKOWITZ_MOSKOVITZ_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Moskowitz";
+   final String second = "Moskovitz";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_Surname_LIPSHITZ_LIPPSZYC_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "LIPSHITZ";
+   final String second = "LIPPSZYC";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_Surname_LEWINSKY_LEVINSKI_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "LEWINSKY";
+   final String second = "LEVINSKI";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_Surname_SZLAMAWICZ_SHLAMOVITZ_SuccessfullyMatched()
+ {
+   // Arrange: the second name ends with a trailing space
+   final String first = "SZLAMAWICZ";
+   final String second = "SHLAMOVITZ ";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_Surname_ROSOCHOWACIEC_ROSOKHOVATSETS_SuccessfullyMatched()
+ {
+   // Arrange: both names are written with embedded spaces
+   final String first = "R o s o ch o w a c ie c";
+   final String second = " R o s o k ho v a ts e ts";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_Surname_PRZEMYSL_PSHEMESHIL_SuccessfullyMatched()
+ {
+   // Arrange: both names are written with embedded spaces
+   final String first = " P rz e m y s l";
+   final String second = " P sh e m e sh i l";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ // **** END YIDDISH/SLAVIC SECTION ****
+
+ @Test
+ public void MRA_Compare_PETERSON_PETERS_SuccessfullyMatched()
+ {
+   // Arrange
+   final String first = "Peterson";
+   final String second = "Peters";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_MCGOWAN_MCGEOGHEGAN_SuccessfullyMatched()
+ {
+   // Arrange: the second name contains a space
+   final String first = "McGowan";
+   final String second = "Mc Geoghegan";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_SurnamesCornerCase_MURPHY_Space_NoMatch()
+ {
+   // Arrange: corner case - the second name is only whitespace
+   final String first = "Murphy";
+   final String second = " ";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_SurnamesCornerCase_MURPHY_NoSpace_NoMatch()
+ {
+   // Arrange: corner case - the second name is empty
+   final String first = "Murphy";
+   final String second = "";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_SurnameCornerCase_Nulls_NoMatch()
+ {
+   // Arrange: corner case - both names are null
+   final String first = null;
+   final String second = null;
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+
+
+ @Test
+ public void MRA_Compare_Surnames_MURPHY_LYNCH_NoMatchExpected()
+ {
+   // Arrange
+   final String first = "Murphy";
+   final String second = "Lynch";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+ @Test
+ public void MRA_Compare_Forenames_SEAN_JOHN_MatchExpected()
+ {
+   // Arrange
+   final String first = "Sean";
+   final String second = "John";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertTrue(matched);
+
+ }
+
+
+ @Test
+ public void MRA_Compare_Forenames_SEAN_PETE_NoMatchExpected()
+ {
+   // Arrange
+   final String first = "Sean";
+   final String second = "Pete";
+
+   // Act
+   final boolean matched = MatchRatingApproach.CompareMRA(first, second);
+
+   // Assert
+   assertFalse(matched);
+
+ }
+
+}
Property changes on: lucene\analysis\phonetic\src\test\org\apache\lucene\analysis\phonetic\MatchRatingApproachTests.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/MatchRatingApproach.java
===================================================================
--- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/MatchRatingApproach.java (revision 0)
+++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/MatchRatingApproach.java (revision 0)
@@ -0,0 +1,379 @@
+package org.apache.lucene.analysis.phonetic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//import org.apache.commons.codec.Encoder;
+import java.util.Locale;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Match Rating Approach Phonetic Algorithm Developed by Western
+ * Airlines in 1977
+ *
+ * Colm Rice (colm_rice at hotmail dot com)
+ * @see <a href="https://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia -
+ * Match Rating Approach</a>
+ */
+public class MatchRatingApproach implements StringEncoder {
+
+ /*
+ * Variable used in the Match Rating Approach Algorithm
+ */
+ private static int SIX = 6;
+
+ /**
+  * Constructs a Match Rating Approach encoder; it declares no instance state.
+  */
+ public MatchRatingApproach() {
+   // The implicit superclass constructor call is all that is required.
+ }
+
+ /**
+  * Obtains the Match Rating Approach (MRA) encoding of a name.
+  *
+  * <p>The input is first cleaned: it is upper-cased, the punctuation
+  * characters {@code - & ' . ,} are stripped, accented letters are replaced
+  * by their plain ASCII equivalents and all whitespace is removed. Digits
+  * and any other symbols are left untouched. The cleaned name is then
+  * encoded by deleting vowels (except a leading one), collapsing doubled
+  * letters and, if more than six letters remain, keeping only the first
+  * three and the last three.
+  *
+  * <p>Limitations: the input is expected to be a single word (i.e. a name)
+  * of at least two letters in the range [A-Z].
+  *
+  * <p>Known mismatch: the Irish name Úna is not matched with the variant
+  * Oonagh. Disappointing but no show stopper.
+  *
+  * @param name
+  *          String to obtain the MRA encoding for; may be {@code null}
+  * @return the MRA code for {@code name}, or the empty string when
+  *         {@code name} is {@code null}, empty, a single space, a single
+  *         character, or contains nothing but stripped characters
+  */
+ public static String GetMRA(String name) {
+   // Bulletproof for trivial input - NINO
+   if (name == null || name.isEmpty() || " ".equals(name)
+       || name.length() == 1) return "";
+
+   // Preprocessing
+   name = CleanName(name);
+
+   // Cleaning may strip every character (e.g. "--" or a run of spaces);
+   // bail out here because RemoveVowels reads the first character.
+   if (name.isEmpty()) return "";
+
+   // 1. Delete all vowels unless the vowel begins the word
+   // 2. Remove the second letter of any doubled letter
+   // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
+   return GetFirst3Last3(RemoveDoubleConsonants(RemoveVowels(name)));
+ }
+
+ /**
+ * Determines if 2 names are homophonous via the Match Rating Approach (MRA)
+ * algorithm
+ *
+ * It should be noted that the strings are cleaned in the same way as above
+ *
+ * @param name1
+ * First of the 2 strings (names) to compare
+ * @param name2
+ * Second of the 2 names to compare
+ * @return true if the encodings are identical false
+ * otherwise.
+ */
+ public static boolean CompareMRA(String name1, String name2) {
+ // Bulletproof for trivial input - NINO
+ if (name1 == null || "".equalsIgnoreCase(name1)
+ || " ".equalsIgnoreCase(name1)) return false;
+ else if (name2 == null || "".equalsIgnoreCase(name2)
+ || " ".equalsIgnoreCase(name2)) return false;
+ else if (name1.length() == 1 || name2.length() == 1) return false;
+ else if (name1.equalsIgnoreCase(name2)) return true;
+
+ // Preprocessing
+ name1 = CleanName(name1);
+ name2 = CleanName(name2);
+
+ // Actual MRA Algorithm
+
+ // 1. Remove vowels
+ name1 = RemoveVowels(name1);
+ name2 = RemoveVowels(name2);
+
+ // 2. Remove double consonants
+ name1 = RemoveDoubleConsonants(name1);
+ name2 = RemoveDoubleConsonants(name2);
+
+ // 3. Reduce down to 3 letters
+ name1 = GetFirst3Last3(name1);
+ name2 = GetFirst3Last3(name2);
+
+ // 4. Check for length difference - if 3 or greater then no similarity
+ // comparison is done
+ if (Math.abs(name1.length() - name2.length()) >= 3) return false;
+
+ // 5. Obtain the minimum rating value by calculating the length sum of the
+ // encoded Strings and sending it down.
+ int sumLength = Math.abs(name1.length() + name2.length());
+ int minRating = 0;
+ minRating = GetMinRating(sumLength);
+
+ // 6. Process the encoded Strings from left to right and remove any
+ // identical characters found from both Strings respectively.
+ int count = LeftToRightThenRightToLeftProcessing(name1, name2);
+
+ // 7. Each PNI item that has a similarity rating equal to or greater than
+ // the min is considered to be a good candidate match
+ return (count >= minRating) ? true : false;
+
+ }
+
+ /**
+  * Deletes every upper-case vowel (A, E, I, O, U) from a name, re-attaching
+  * the first character afterwards when that character is itself a vowel.
+  * An empty name is returned unchanged instead of failing on the
+  * first-letter lookup.
+  */
+ private static String RemoveVowels(String name) {
+   if (name.isEmpty()) return name;
+
+   String firstLetter = name.substring(0, 1);
+   // One character class replaces the five separate single-vowel passes.
+   String stripped = name.replaceAll("[AEIOU]", "");
+   stripped = stripped.replaceAll("\\s{2,}\\b", " ");
+   return IsVowel(firstLetter) ? (firstLetter + stripped) : stripped;
+ }
+
+ /**
+  * Reports whether the given string is exactly one of the vowels A, E, I, O
+  * or U, ignoring case.
+  */
+ private static boolean IsVowel(String letter) {
+   // Candidates are ordered by English letter frequency:
+   // e=12.702%; a=8.167%; o=7.507%; i=6.996%; u=2.758%
+   String[] vowels = {"E", "A", "O", "I", "U"};
+   for (String vowel : vowels) {
+     if (letter.equalsIgnoreCase(vowel)) return true;
+   }
+
+   return false;
+ }
+
+ // Collapses each pair of identical adjacent letters (AA..ZZ, vowels as well
+ // as consonants) into a single letter; every pair is literal text, so plain
+ // String.replace is used. A run of three keeps two letters ("LLL" -> "LL").
+ private static String RemoveDoubleConsonants(String name) {
+   name = name.replace("AA", "A");
+   name = name.replace("BB", "B");
+   name = name.replace("CC", "C");
+   name = name.replace("DD", "D");
+   name = name.replace("EE", "E");
+   name = name.replace("FF", "F");
+   name = name.replace("GG", "G");
+   name = name.replace("HH", "H");
+   name = name.replace("II", "I");
+   name = name.replace("JJ", "J");
+   name = name.replace("KK", "K");
+   name = name.replace("LL", "L");
+   name = name.replace("MM", "M");
+   name = name.replace("NN", "N");
+   name = name.replace("OO", "O");
+   name = name.replace("PP", "P");
+   name = name.replace("QQ", "Q");
+   name = name.replace("RR", "R");
+   name = name.replace("SS", "S");
+   name = name.replace("TT", "T");
+   name = name.replace("UU", "U");
+   name = name.replace("VV", "V");
+   name = name.replace("WW", "W");
+   name = name.replace("XX", "X");
+   name = name.replace("YY", "Y");
+   name = name.replace("ZZ", "Z");
+   return name;
+ }
+
+ // Shortens a name longer than 6 characters to its first 3 plus its last 3
+ // characters; shorter names are returned unchanged.
+ private static String GetFirst3Last3(String name) {
+   if (name.length() <= 6) return name;
+
+   String head = name.substring(0, 3);
+   return head + name.substring(name.length() - 3);
+ }
+
+ /**
+  * Looks up the minimum similarity rating two encodings must reach, given
+  * the sum of their lengths:
+  *
+  * <pre>
+  *   sum &lt;= 4    : 5
+  *   sum 5 to 7   : 4
+  *   sum 8 to 11  : 3
+  *   sum == 12    : 2
+  *   sum &gt; 12    : 1 (the documentation said little on this; unlikely to occur)
+  * </pre>
+  */
+ private static int GetMinRating(int sumLength) {
+   if (sumLength <= 4) return 5;
+   if (sumLength <= 7) return 4;
+   if (sumLength <= 11) return 3;
+   if (sumLength == 12) return 2;
+
+   return 1;
+ }
+
+ /**
+  * Computes the similarity rating of two encoded names.
+  *
+  * <p>For every index {@code i} that exists in both names, the characters
+  * {@code i} places from the left are compared, and so are the characters
+  * {@code i} places from the right; each pair that is equal is cancelled in
+  * both names. Comparisons always read the unmodified inputs, so a
+  * character may be cancelled by either direction. The rating is the
+  * absolute difference between 6 and the larger number of characters left
+  * uncancelled.
+  *
+  * @param name1
+  *          first encoded name
+  * @param name2
+  *          second encoded name
+  * @return the similarity rating of the two names
+  */
+ private static int LeftToRightThenRightToLeftProcessing(String name1,
+     String name2) {
+   char[] remaining1 = name1.toCharArray();
+   char[] remaining2 = name2.toCharArray();
+
+   int last1 = name1.length() - 1;
+   int last2 = name2.length() - 1;
+
+   // Only indices present in both names can be compared.
+   int shared = Math.min(name1.length(), name2.length());
+
+   for (int i = 0; i < shared; i++) {
+     // Left to right...
+     if (name1.charAt(i) == name2.charAt(i)) {
+       remaining1[i] = ' ';
+       remaining2[i] = ' ';
+     }
+
+     // Right to left...
+     if (name1.charAt(last1 - i) == name2.charAt(last2 - i)) {
+       remaining1[last1 - i] = ' ';
+       remaining2[last2 - i] = ' ';
+     }
+   }
+
+   // Cancelled slots were blanked out; drop them to count what is left
+   int left1 = new String(remaining1).replaceAll("\\s+", "").length();
+   int left2 = new String(remaining2).replaceAll("\\s+", "").length();
+
+   // Final bit - subtract the longer remainder from 6
+   return Math.abs(SIX - Math.max(left1, left2));
+ }
+
+ /**
+  * Normalises a name before encoding: upper-cases it (English locale),
+  * strips the punctuation characters {@code - & ' . ,}, replaces accented
+  * letters via RemoveAccents and finally removes all whitespace.
+  */
+ private static String CleanName(String name) {
+   String cleaned = name.toUpperCase(Locale.ENGLISH);
+
+   // Not comprehensive, just the most common punctuation marks
+   for (String punctuation : new String[] {"\\-", "[&]", "\\'", "\\.", "[\\,]"}) {
+     cleaned = cleaned.replaceAll(punctuation, "");
+   }
+
+   String unaccented = RemoveAccents(cleaned);
+   return unaccented.replaceAll("\\s+", "");
+ }
+
+ /**
+  * Replaces each accented character listed in UNICODE with the plain ASCII
+  * letter at the same index of PLAIN_ASCII; all other characters are copied
+  * through unchanged. cf.
+  * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
+  * @param s
+  *          text to strip of accents; may be {@code null}
+  * @return the unaccented text, or {@code null} if {@code s} is {@code null}
+  */
+ public static String RemoveAccents(String s) {
+   if (s == null) return null;
+
+   StringBuilder plain = new StringBuilder();
+   for (char c : s.toCharArray()) {
+     int pos = UNICODE.indexOf(c);
+     plain.append(pos > -1 ? PLAIN_ASCII.charAt(pos) : c);
+   }
+   return plain.toString();
+ }
+
+ private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" // grave
+ + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" // acute
+ + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" // circumflex
+ + "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" // tilde
+ + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" // umlaut
+ + "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171"; // ring, cedilla, double acute
+ // PLAIN_ASCII.charAt(i) is the unaccented replacement for UNICODE.charAt(i); keep both strings aligned.
+ private static final String PLAIN_ASCII = "AaEeIiOoUu" // grave
+ + "AaEeIiOoUuYy" // acute
+ + "AaEeIiOoUuYy" // circumflex
+ + "AaOoNn" // tilde
+ + "AaEeIiOoUuYy" // umlaut
+ + "Aa" // ring
+ + "Cc" // cedilla
+ + "OoUu" // double acute
+ ;
+
+ /**
+  * Encodes an Object using the Match Rating Approach algorithm. This method
+  * is provided in order to satisfy the requirements of the Encoder
+  * interface, and will throw an EncoderException if the supplied object is
+  * not of type java.lang.String ({@code null} included).
+  *
+  * @param pObject
+  *          Object to encode
+  * @return the Match Rating Approach code (a java.lang.String) for pObject
+  * @throws EncoderException
+  *           if the parameter supplied is not of type java.lang.String
+  */
+ @Override
+ public Object encode(Object pObject) throws EncoderException {
+   if (!(pObject instanceof String)) {
+     throw new EncoderException(
+         "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
+   }
+   return GetMRA((String) pObject);
+ }
+
+ /**
+  * Encodes a String using the Match Rating Approach (MRA) algorithm.
+  *
+  * @param pString String object to encode
+  * @return The MRA code corresponding to the String supplied
+  */
+ @Override
+ public String encode(String pString) {
+   return GetMRA(pString);
+ }
+
+}
Property changes on: lucene\analysis\phonetic\src\java\org\apache\lucene\analysis\phonetic\MatchRatingApproach.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1401241)
+++ lucene/CHANGES.txt (working copy)
@@ -20,6 +20,9 @@
New Features
+* LUCENE-4494: New phonetic algorithm Match Rating Approach (MRA)
+ and associated tests. (Colm Rice)
+
* LUCENE-4226: New experimental StoredFieldsFormat (in lucene/codecs) that
compresses chunks of documents together in order to improve the compression
ratio. (Adrien Grand)