Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/MatchRatingApproachTests.java
===================================================================
--- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/MatchRatingApproachTests.java (revision 0)
+++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/MatchRatingApproachTests.java (revision 0)
@@ -0,0 +1,343 @@
+package org.apache.lucene.analysis.phonetic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import org.junit.Test;
+
+/**
+ * Series of tests for the Match Rating Approach algorithm.
+ *
+ * General naming nomenclature for the tests is of the form:
+ * GeneralMetadataOnTheTestArea_ActualTestValues_ExpectedResult
+ *
+ * An unusual value is indicated by the term "corner case".
+ */
+public class MatchRatingApproachTests {
+
+  // ***** Begin Region - Test Accent Removal
+
+  @Test
+  public void MRA_Test_AccentRemoval_AllLower_SuccessfullyRemoved() {
+    assertEquals("aeiou", MatchRatingApproach.RemoveAccents("áéíóú"));
+  }
+
+  @Test
+  public void MRA_Test_AccentRemoval_WithSpaces_SuccessfullyRemovedAndSpacesInvariant() {
+    assertEquals("ae io u", MatchRatingApproach.RemoveAccents("áé íó ú"));
+  }
+
+  @Test
+  public void MRA_Test_AccentRemoval_UpperandLower_SuccessfullyRemovedAndCaseInvariant() {
+    assertEquals("AeiOuu", MatchRatingApproach.RemoveAccents("ÁeíÓuu"));
+  }
+
+  @Test
+  public void MRA_Test_AccentRemoval_MixedWithUnusualChars_SuccessfullyRemovedAndUnusualcharactersInvariant() {
+    assertEquals("A-e'i.,o&u", MatchRatingApproach.RemoveAccents("Á-e'í.,ó&ú"));
+  }
+
+  @Test
+  public void MRA_Test_AccentRemoval_GerSpanFrenMix_SuccessfullyRemoved() {
+    // German umlauts and sharp s, Spanish n-tilde and French a-grave; the
+    // sharp s has no ASCII mapping and is expected to come back unchanged
+    assertEquals("aeoußAEOUnNa", MatchRatingApproach.RemoveAccents("äëöüßÄËÖÜñÑà"));
+  }
+
+  @Test
+  public void MRA_AccentRemovalNormalString_NoChange() {
+    String plain = "Colorless green ideas sleep furiously";
+    assertEquals(plain, MatchRatingApproach.RemoveAccents(plain));
+  }
+
+  @Test
+  public void MRA_AccentRemoval_NINO_NoChange() {
+    assertEquals("", MatchRatingApproach.RemoveAccents(""));
+  }
+
+  // ***** End Region - Test Accent Removal
+
+  // ***** Begin Region - Test Get Encoding
+
+  @Test
+  public void MRA_GetEncoding_HARPER_HRPR() {
+    assertEquals("HRPR", MatchRatingApproach.GetMRA("HARPER"));
+  }
+
+  @Test
+  public void MRA_GetEncoding_SMITH_to_SMTH() {
+    assertEquals("SMTH", MatchRatingApproach.GetMRA("Smith"));
+  }
+
+  @Test
+  public void MRA_GetEncoding_SMYTH_to_SMYTH() {
+    assertEquals("SMYTH", MatchRatingApproach.GetMRA("Smyth"));
+  }
+
+  @Test
+  public void MRA_GetEncoding_Space_to_Nothing() {
+    assertEquals("", MatchRatingApproach.GetMRA(" "));
+  }
+
+  @Test
+  public void MRA_GetEncoding_NoSpace_to_Nothing() {
+    assertEquals("", MatchRatingApproach.GetMRA(""));
+  }
+
+  @Test
+  public void MRA_GetEncoding_Null_to_Nothing() {
+    assertEquals("", MatchRatingApproach.GetMRA(null));
+  }
+
+  @Test
+  public void MRA_GetEncoding_One_Letter_to_Nothing() {
+    assertEquals("", MatchRatingApproach.GetMRA("E"));
+  }
+
+  @Test
+  public void MRA_GetEncoding_OnlySpacesAndPunctuation_to_Nothing() {
+    // Longer than one character, yet nothing is left to encode after cleaning
+    assertEquals("", MatchRatingApproach.GetMRA("   "));
+    assertEquals("", MatchRatingApproach.GetMRA(" -.' "));
+  }
+
+  // ***** Begin Region - Test Compare - Forenames
+
+  @Test
+  public void MRA_Compare_SMITH_SMYTH_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Smith", "smyth"));
+  }
+
+  @Test
+  public void MRA_Compare_BURNS_BOURNE_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Burns", "Bourne"));
+  }
+
+  @Test
+  public void MRA_Compare_ShortNames_AL_ED_WorksButNoMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA("Al", "Ed"));
+  }
+
+  @Test
+  public void MRA_Compare_CATHERINE_KATHRYN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Catherine", "Kathryn"));
+  }
+
+  @Test
+  public void MRA_Compare_BRIAN_BRYAN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Brian", "Bryan"));
+  }
+
+  @Test
+  public void MRA_Compare_SEAN_SHAUN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Séan", "Shaun"));
+  }
+
+  @Test
+  public void MRA_Compare_COLM_COLIN_WithAccentsAndSymbolsAndSpaces_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Cólm. ", "C-olín"));
+  }
+
+  @Test
+  public void MRA_Compare_STEPHEN_STEVEN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Stephen", "Steven"));
+  }
+
+  @Test
+  public void MRA_Compare_STEVEN_STEFAN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Steven", "Stefan"));
+  }
+
+  @Test
+  public void MRA_Compare_STEPHEN_STEFAN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Stephen", "Stefan"));
+  }
+
+  @Test
+  public void MRA_Compare_SAM_SAMUEL_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Sam", "Samuel"));
+  }
+
+  @Test
+  public void MRA_Compare_MICKY_MICHAEL_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Micky", "Michael"));
+  }
+
+  @Test
+  public void MRA_Compare_OONA_OONAGH_SuccessfulloyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Oona", "Oonagh"));
+  }
+
+  @Test
+  public void MRA_Compare_SOPHIE_SOFIA_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Sophie", "Sofia"));
+  }
+
+  @Test
+  public void MRA_Compare_FRANCISZEK_FRANCES_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Franciszek", "Frances"));
+  }
+
+  @Test
+  public void MRA_Compare_TOMASZ_TOM_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Tomasz", "tom"));
+  }
+
+  @Test
+  public void MRA_Compare_SmallInput_CARK_Kl_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Karl", "Kl"));
+  }
+
+  @Test
+  public void MRA_CompareNameToSingleLetter_KARL_C_DoesNotMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA("C", "Karl"));
+  }
+
+  @Test
+  public void MRA_Compare_ZACH_ZAKARIA_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Zakaria", "Zach"));
+  }
+
+  @Test
+  public void MRA_Compare_KARL_ALESSANDRO_DoesNotMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA("Karl", "Alessandro"));
+  }
+
+  @Test
+  public void MRA_Compare_Forenames_UNA_OONAGH_ShouldSuccessfullyMatchButDoesNot() {
+    // Known mismatch: ideally these would match, so assertFalse only pins current behavior
+    assertFalse(MatchRatingApproach.CompareMRA("Úna", "Oonagh"));
+  }
+
+  // ***** Begin Region - Test Compare - Surnames
+
+  @Test
+  public void MRA_Compare_Surname_OSULLIVAN_OSUILLEABHAIN_SuccessfulMatch() {
+    assertTrue(MatchRatingApproach.CompareMRA("O'Sullivan", "Ó ' Súilleabháin"));
+  }
+
+  @Test
+  public void MRA_Compare_LongSurnames_MORIARTY_OMUIRCHEARTAIGH_DoesNotSuccessfulMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA("Moriarty", "OMuircheartaigh"));
+  }
+
+  @Test
+  public void MRA_Compare_LongSurnames_OMUIRCHEARTAIGH_OMIREADHAIGH_SuccessfulMatch() {
+    assertTrue(MatchRatingApproach.CompareMRA("o'muireadhaigh", "Ó 'Muircheartaigh "));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_COOPERFLYNN_SUPERLYN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Cooper-Flynn", "Super-Lyn"));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_HAILEY_HALLEY_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Hailey", "Halley"));
+  }
+
+  // **** BEGIN YIDDISH/SLAVIC SECTION ****
+
+  @Test
+  public void MRA_Compare_Surname_AUERBACH_UHRBACH_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Auerbach", "Uhrbach"));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_MOSKOWITZ_MOSKOVITZ_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Moskowitz", "Moskovitz"));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_LIPSHITZ_LIPPSZYC_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("LIPSHITZ", "LIPPSZYC"));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_LEWINSKY_LEVINSKI_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("LEWINSKY", "LEVINSKI"));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_SZLAMAWICZ_SHLAMOVITZ_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("SZLAMAWICZ", "SHLAMOVITZ "));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_ROSOCHOWACIEC_ROSOKHOVATSETS_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("R o s o ch o w a c ie c", " R o s o k ho v a ts e ts"));
+  }
+
+  @Test
+  public void MRA_Compare_Surname_PRZEMYSL_PSHEMESHIL_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA(" P rz e m y s l", " P sh e m e sh i l"));
+  }
+
+  // **** END YIDDISH/SLAVIC SECTION ****
+
+  @Test
+  public void MRA_Compare_PETERSON_PETERS_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("Peterson", "Peters"));
+  }
+
+  @Test
+  public void MRA_Compare_MCGOWAN_MCGEOGHEGAN_SuccessfullyMatched() {
+    assertTrue(MatchRatingApproach.CompareMRA("McGowan", "Mc Geoghegan"));
+  }
+
+  @Test
+  public void MRA_Compare_SurnamesCornerCase_MURPHY_Space_NoMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA("Murphy", " "));
+  }
+
+  @Test
+  public void MRA_Compare_SurnamesCornerCase_MURPHY_NoSpace_NoMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA("Murphy", ""));
+  }
+
+  @Test
+  public void MRA_Compare_SurnamesCornerCase_MURPHY_OnlySpacesAndPunctuation_NoMatch() {
+    // Longer than one character, yet nothing is left to compare after cleaning
+    assertFalse(MatchRatingApproach.CompareMRA("Murphy", "   "));
+    assertFalse(MatchRatingApproach.CompareMRA(" -.' ", "Murphy"));
+  }
+
+  @Test
+  public void MRA_Compare_SurnameCornerCase_Nulls_NoMatch() {
+    assertFalse(MatchRatingApproach.CompareMRA(null, null));
+  }
+
+  @Test
+  public void MRA_Compare_Surnames_MURPHY_LYNCH_NoMatchExpected() {
+    assertFalse(MatchRatingApproach.CompareMRA("Murphy", "Lynch"));
+  }
+
+  @Test
+  public void MRA_Compare_Forenames_SEAN_JOHN_MatchExpected() {
+    assertTrue(MatchRatingApproach.CompareMRA("Sean", "John"));
+  }
+
+  @Test
+  public void MRA_Compare_Forenames_SEAN_PETE_NoMatchExpected() {
+    assertFalse(MatchRatingApproach.CompareMRA("Sean", "Pete"));
+  }
+
+}

Property changes on: lucene\analysis\phonetic\src\test\org\apache\lucene\analysis\phonetic\MatchRatingApproachTests.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/MatchRatingApproach.java
===================================================================
--- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/MatchRatingApproach.java (revision 0)
+++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/MatchRatingApproach.java (revision 0)
@@ -0,0 +1,336 @@
+package org.apache.lucene.analysis.phonetic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+import org.apache.commons.codec.EncoderException;
+import org.apache.commons.codec.StringEncoder;
+
+/**
+ * Match Rating Approach (MRA) phonetic algorithm, developed by Western
+ * Airlines in 1977.
+ *
+ * Colm Rice (colm_rice at hotmail dot com)
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia -
+ *      Match Rating Approach</a>
+ */
+public class MatchRatingApproach implements StringEncoder {
+
+  /** Maximum length of an MRA code; similarity ratings are counted down from it. */
+  private static final int MAX_CODE_LENGTH = 6;
+
+  /** Punctuation dropped while cleaning: hyphen, ampersand, apostrophe, full stop, comma. */
+  private static final Pattern PUNCTUATION = Pattern.compile("[-&'.,]");
+
+  /** A run of whitespace characters. */
+  private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
+  /** The upper-case vowels removed from a cleaned name. */
+  private static final Pattern VOWELS = Pattern.compile("[AEIOU]");
+
+  /** A pair of identical, adjacent upper-case ASCII letters. */
+  private static final Pattern DOUBLE_LETTER = Pattern.compile("([A-Z])\\1");
+
+  /**
+   * Creates an instance of the Match Rating Approach encoder.
+   */
+  public MatchRatingApproach() {
+    super();
+  }
+
+  /**
+   * Obtains the Match Rating Approach (MRA) encoding of a String. The input is
+   * upper-cased and cleaned before the main processing begins. Cleaning
+   * removes a) the most common punctuation (hyphen, ampersand, apostrophe,
+   * full stop and comma), b) the accents known to RemoveAccents and c) any
+   * whitespace. Numbers are not removed.
+   *
+   * Limitations: the input is expected to be a single word (i.e. a name) with
+   * at least 2 characters in the range [A-Z] and no numbers.
+   *
+   * Improvements: punctuation removal only covers the most common symbols and
+   * could be extended.
+   *
+   * Known mismatches: the Irish name Úna is not matched with the equivalent
+   * name Oonagh.
+   *
+   * @param name
+   *          String to obtain the MRA encoding for
+   * @return the MRA code for the String supplied; the empty String if the
+   *         input is null, shorter than 2 characters, or has no characters
+   *         left after cleaning
+   */
+  public static String GetMRA(String name) {
+    // Bulletproof for trivial input - NINO
+    if (name == null || name.length() <= 1) {
+      return "";
+    }
+    return EncodeName(name);
+  }
+
+  /**
+   * Determines whether 2 names are homophonous via the Match Rating Approach
+   * (MRA) algorithm. Both names are cleaned and encoded in the same way as in
+   * {@link #GetMRA(String)} before their codes are compared.
+   *
+   * @param name1
+   *          First of the 2 strings (names) to compare
+   * @param name2
+   *          Second of the 2 names to compare
+   * @return false if either name is null or shorter than 2 characters; else
+   *         true if the names are equal ignoring case; else false if either
+   *         name has no characters left after cleaning; else whether the
+   *         similarity rating of the two codes reaches the minimum rating for
+   *         their combined length
+   */
+  public static boolean CompareMRA(String name1, String name2) {
+    // Bulletproof for trivial input - NINO
+    if (name1 == null || name2 == null || name1.length() <= 1 || name2.length() <= 1) {
+      return false;
+    }
+    if (name1.equalsIgnoreCase(name2)) {
+      return true;
+    }
+
+    // 1-3. Clean both names and reduce them to their MRA codes
+    String code1 = EncodeName(name1);
+    String code2 = EncodeName(name2);
+
+    // Nothing left after cleaning (e.g. "--"): there is no code to compare
+    if (code1.length() == 0 || code2.length() == 0) {
+      return false;
+    }
+
+    // 4. Check for length difference - if 3 or greater then no similarity
+    // comparison is done
+    if (Math.abs(code1.length() - code2.length()) >= 3) {
+      return false;
+    }
+
+    // 5. Obtain the minimum rating from the length sum of the two codes
+    int minRating = GetMinRating(code1.length() + code2.length());
+
+    // 6. Mark the letters that are identical in both codes, left to right and
+    // right to left, and rate what is left
+    int rating = LeftToRightThenRightToLeftProcessing(code1, code2);
+
+    // 7. A rating equal to or greater than the minimum is considered a match
+    return rating >= minRating;
+  }
+
+  /**
+   * Cleans a name and applies the three encoding steps of the algorithm.
+   *
+   * @param name
+   *          raw name, must not be null
+   * @return the MRA code; the empty String if cleaning leaves no characters
+   */
+  private static String EncodeName(String name) {
+    String cleaned = CleanName(name);
+    if (cleaned.length() == 0) {
+      // e.g. "   " or "--": RemoveVowels would have no first letter to inspect
+      return "";
+    }
+    // 1. Delete all vowels unless the vowel begins the word
+    String code = RemoveVowels(cleaned);
+    // 2. Remove second consonant from any double consonant
+    code = RemoveDoubleConsonants(code);
+    // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
+    return GetFirst3Last3(code);
+  }
+
+  // Delete all vowels unless the vowel begins the word. The name must not be
+  // empty.
+  private static String RemoveVowels(String name) {
+    String firstLetter = name.substring(0, 1);
+    String consonants = VOWELS.matcher(name).replaceAll("");
+    return IsVowel(firstLetter) ? (firstLetter + consonants) : consonants;
+  }
+
+  // Determines if a letter is a vowel. Returns true if this is the case, else
+  // false
+  private static boolean IsVowel(String letter) {
+    return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A")
+        || letter.equalsIgnoreCase("O") || letter.equalsIgnoreCase("I")
+        || letter.equalsIgnoreCase("U");
+  }
+
+  // Replace any pair of identical letters with the single letter equivalent
+  private static String RemoveDoubleConsonants(String name) {
+    return DOUBLE_LETTER.matcher(name).replaceAll("$1");
+  }
+
+  // Gets the first 3 and last 3 letters of a name (if greater than 6
+  // characters) else returns the name
+  private static String GetFirst3Last3(String name) {
+    int nameLength = name.length();
+    if (nameLength <= MAX_CODE_LENGTH) {
+      return name;
+    }
+    return name.substring(0, 3) + name.substring(nameLength - 3);
+  }
+
+  // Obtains the min rating (values from documentation) of the length sum of the
+  // 2 names.
+  private static int GetMinRating(int sumLength) {
+    if (sumLength <= 4) {
+      return 5;
+    } else if (sumLength <= 7) {
+      return 4;
+    } else if (sumLength <= 11) {
+      return 3;
+    } else if (sumLength == 12) {
+      return 2;
+    }
+    // Two codes of at most 6 letters never add up to more than 12, so this
+    // fallback is not expected to be reached.
+    return 1;
+  }
+
+  // Compares the two codes position by position, from the left and from the
+  // right at the same time, and marks identical letters found in the same
+  // position as matched. Returns the difference between 6 and the number of
+  // unmatched letters in whichever code has more of them left.
+  private static int LeftToRightThenRightToLeftProcessing(String name1,
+      String name2) {
+    boolean[] matched1 = new boolean[name1.length()];
+    boolean[] matched2 = new boolean[name2.length()];
+    int last1 = name1.length() - 1;
+    int last2 = name2.length() - 1;
+    int overlap = Math.min(name1.length(), name2.length());
+
+    for (int i = 0; i < overlap; i++) {
+      // Left to right...
+      if (name1.charAt(i) == name2.charAt(i)) {
+        matched1[i] = true;
+        matched2[i] = true;
+      }
+      // Right to left...
+      if (name1.charAt(last1 - i) == name2.charAt(last2 - i)) {
+        matched1[last1 - i] = true;
+        matched2[last2 - i] = true;
+      }
+    }
+
+    int unmatched = Math.max(CountUnmatched(matched1), CountUnmatched(matched2));
+    return Math.abs(MAX_CODE_LENGTH - unmatched);
+  }
+
+  // Counts the positions that were not marked as matched
+  private static int CountUnmatched(boolean[] matched) {
+    int count = 0;
+    for (boolean isMatched : matched) {
+      if (!isMatched) {
+        count++;
+      }
+    }
+    return count;
+  }
+
+  // Cleans up a name: 1. Uppercases everything 2. Removes punctuation (not
+  // comprehensive, just the most common ones)
+  // 3. Removes accents 4. Removes any whitespace
+  private static String CleanName(String name) {
+    String upperName = name.toUpperCase(Locale.ENGLISH);
+    upperName = PUNCTUATION.matcher(upperName).replaceAll("");
+    upperName = RemoveAccents(upperName);
+    return WHITESPACE.matcher(upperName).replaceAll("");
+  }
+
+  /**
+   * Replaces the accented characters known to this class with their ASCII
+   * equivalents; every other character is copied unchanged.
+   *
+   * cf.
+   * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
+   *
+   * @param s
+   *          String to process, may be null
+   * @return the String without the known accents, or null if s is null
+   */
+  public static String RemoveAccents(String s) {
+    if (s == null) {
+      return null;
+    }
+
+    int n = s.length();
+    StringBuilder sb = new StringBuilder(n);
+    for (int i = 0; i < n; i++) {
+      char c = s.charAt(i);
+      int pos = UNICODE.indexOf(c);
+      sb.append(pos > -1 ? PLAIN_ASCII.charAt(pos) : c);
+    }
+    return sb.toString();
+  }
+
+  // Accented characters handled by RemoveAccents; index-aligned with PLAIN_ASCII
+  private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9"
+      + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD"
+      + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177"
+      + "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1"
+      + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF"
+      + "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
+
+  // ASCII replacement for the character at the same index in UNICODE
+  private static final String PLAIN_ASCII = "AaEeIiOoUu" // grave
+      + "AaEeIiOoUuYy" // acute
+      + "AaEeIiOoUuYy" // circumflex
+      + "AaOoNn" // tilde
+      + "AaEeIiOoUuYy" // umlaut
+      + "Aa" // ring
+      + "Cc" // cedilla
+      + "OoUu"; // double acute
+
+  /**
+   * Encodes an Object using the Match Rating Approach algorithm. This method is
+   * provided in order to satisfy the requirements of the Encoder interface, and
+   * will throw an EncoderException if the supplied object is not of type
+   * java.lang.String.
+   *
+   * @param pObject
+   *          Object to encode
+   * @return An object (of type java.lang.String) containing the Match Rating
+   *         Approach code which corresponds to the String supplied.
+   * @throws EncoderException
+   *           if the parameter supplied is not of type java.lang.String
+   */
+  @Override
+  public Object encode(Object pObject) throws EncoderException {
+    if (!(pObject instanceof String)) {
+      throw new EncoderException(
+          "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
+    }
+    return GetMRA((String) pObject);
+  }
+
+  /**
+   * Encodes a String using the Match Rating Approach (MRA) algorithm.
+   *
+   * @param pString
+   *          String object to encode
+   * @return The MRA code corresponding to the String supplied
+   */
+  @Override
+  public String encode(String pString) {
+    return GetMRA(pString);
+  }
+
+}

Property changes on: lucene\analysis\phonetic\src\java\org\apache\lucene\analysis\phonetic\MatchRatingApproach.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1401241)
+++ lucene/CHANGES.txt (working copy)
@@ -20,6 +20,9 @@
 
 New Features
 
+* LUCENE-4494: New phonetic algorithm Match Rating Approach (MRA)
+  and associated tests. (Colm Rice)
+
 * LUCENE-4226: New experimental StoredFieldsFormat (in lucene/codecs) that
   compresses chunks of documents together in order to improve the compression
   ratio. (Adrien Grand)