Index: contrib/lexicon/lib/commons-lang-2.3.jar =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib/lexicon/lib/commons-lang-2.3.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/TestLexicon.java =================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/TestLexicon.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/TestLexicon.java (revision 0) @@ -0,0 +1,129 @@ +package org.apache.lucene.lexicon; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.aphone.AphoneEn; +import org.apache.lucene.lexicon.analyzer.AphoneAnalyser; +import org.apache.lucene.lexicon.analyzer.NGramAnalyzer; +import org.apache.lucene.lexicon.analyzer.SimilarAble; +import org.apache.lucene.lexicon.analyzer.StemmerAnalyzer; +import org.apache.lucene.lexicon.reader.DirectoryReader; +import org.apache.lucene.lexicon.reader.LexiconReader; +import org.apache.lucene.lexicon.reader.SimpleReader; +import org.apache.lucene.lexicon.reader.Word; +import org.apache.lucene.store.RAMDirectory; + +/** + * + * @Author Mathieu Lecarme + */ +public class TestLexicon extends TestCase { + private String[] words = new String[]{ + "Lucene", + "Apache", + "lexicon" + }; + + private Lexicon buildLexicon() throws IOException { + //return new Lexicon(FSDirectory.getDirectory("/tmp/index", false)); + return new Lexicon(new RAMDirectory()); + } + + private void fillExamples(Lexicon lexicon) throws IOException { + LexiconReader reader = new SimpleReader(words); + reader.setTokenFilter(new LowerCaseFilter(reader.getStream())); + lexicon.read(reader); + } + + public void testAphone () throws IOException { + Lexicon lexicon = buildLexicon(); + AphoneAnalyser analyzer = new AphoneAnalyser(new AphoneEn()); + lexicon.addAnalyser(analyzer); + fillExamples(lexicon); + Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("Lussene is an appache project")); + SimilarTokenFilter filter = new SimilarTokenFilter(tokenizer, lexicon, null, new SimilarAble[] {analyzer}); + for(;;){ + Token token = filter.next(); + if(token == null) + break; + System.out.println(token.termText()); + } + } + + public void testStemmer() throws IOException { + Lexicon 
lexicon = buildLexicon(); + StemmerAnalyzer analyzer = new StemmerAnalyzer("French"); + String[] words2 = new String[] {"cheval", "bataille", "carotte"}; + LexiconReader reader = new SimpleReader(words2); + reader.setTokenFilter(new LowerCaseFilter(reader.getStream())); + lexicon.addAnalyser(analyzer); + lexicon.read(reader); + String[] similars = lexicon.findSimilar("chevaux"); + assertEquals(1, similars.length); + similars = lexicon.findSimilar("bateaux"); + assertEquals(0, similars.length); + similars = lexicon.findSimilar("carote"); + assertEquals(0, similars.length); + } + + public void testSimilar() throws IOException { + Lexicon lexicon = buildLexicon(); + AphoneAnalyser analyzer = new AphoneAnalyser(new AphoneEn()); + lexicon.addAnalyser(analyzer); + fillExamples(lexicon); + String[] similars = lexicon.findSimilar("appache", null, new SimilarAble[] { analyzer}); + assertEquals(1, similars.length); + similars = lexicon.findSimilar("appache", new String[]{"foo"}, new SimilarAble[] { analyzer}); + assertEquals(0, similars.length); + similars = lexicon.findSimilar("appache", new String[]{Word.SIMPLE}, new SimilarAble[] { analyzer}); + assertEquals(1, similars.length); + similars = lexicon.findSimilar("appache", null, null); + assertEquals(1, similars.length); + similars = lexicon.findSimilar("toto", null, new SimilarAble[] { analyzer}); + assertEquals(0, similars.length); + } + + public void testNear() throws IOException { + Lexicon lexicon = buildLexicon(); + NGramAnalyzer analyzer = new NGramAnalyzer(); + lexicon.addAnalyser(analyzer); + fillExamples(lexicon); + Suggestions nears = lexicon.findNear("apoche"); + //System.out.println(nears); + assertEquals(1, nears.size()); + assertEquals("apache", nears.getWordIterator().next()); + } + + public void testDirectory() throws IOException{ + LexiconReader lexiconReader = new DirectoryReader(TestDirectoryReader.buildDirectory()); + Lexicon lexicon = new Lexicon(new RAMDirectory()); + lexicon.addAnalyser(new 
NGramAnalyzer()); + lexicon.read(lexiconReader); + assertEquals(9, lexicon.getReader().numDocs()); + assertEquals("lazy", lexicon.findNear("layz").getWordIterator().next()); + } +} Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/analyzer/TestAnalyzer.java =================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/analyzer/TestAnalyzer.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/analyzer/TestAnalyzer.java (revision 0) @@ -0,0 +1,56 @@ +/** + * + */ +package org.apache.lucene.lexicon.analyzer; + +import junit.framework.TestCase; + +import org.apache.lucene.aphone.Aphone; +import org.apache.lucene.aphone.AphoneFr; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.Term; + +/** + * @author mlecarme + * + */ +public class TestAnalyzer extends TestCase{ + + private void runLexiconAnalyzer(LexiconAnalyzer analyzer) { + Term term = new Term("test", "lucene"); + Field[] fields = analyzer.getFields(term); + for(int a=0; a < fields.length; a++) { + //System.out.println(fields[a]); + } + } + + private void runSimilar(String word, SimilarAble analyzer, String[] similars) { + for(int a =0; a < similars.length; a++) + assertEquals(analyzer.simplify(similars[a]), analyzer.simplify(word)); + } + + public void testNgram() { + LexiconAnalyzer ngram = new NGramAnalyzer(); + runLexiconAnalyzer(ngram); + } + + public void testFrenchAphone() { + Aphone aphone = new AphoneFr(); + AphoneAnalyser analyzer = new AphoneAnalyser(aphone); + runLexiconAnalyzer(analyzer); + runSimilar("lucene", analyzer, new String[]{"lucenes", "lussene", "lhucene"}); + } + + public void testStem() { + StemmerAnalyzer analyzer = new StemmerAnalyzer("French"); + runLexiconAnalyzer(analyzer); + runSimilar("lucene", analyzer, new String[]{"lucenne", "lucenes"}); + } + + public void testAnagram() { + AnagramAnalyzer analyzer = new AnagramAnalyzer(); + runLexiconAnalyzer(analyzer); + 
runSimilar("lucene", analyzer, new String[]{"lucenne", "nucle", "lucne"}); + } + +} Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/TestQueryUtils.java =================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/TestQueryUtils.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/TestQueryUtils.java (revision 0) @@ -0,0 +1,44 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.util.Set; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Query; + +/** + * @author Mathieu Lecarme + * + */ +public class TestQueryUtils extends TestCase { + public void testExtractTerms() throws ParseException { + QueryParser parser = new QueryParser("txt", new WhitespaceAnalyzer()); + Query query = parser.parse("lorem k:ipsum OR (dolores +nic) consequat*"); + Set terms = QueryUtils.extractTerms(query); + assertTrue(terms.remove(new Term("txt", "lorem"))); + assertTrue(terms.remove(new Term("k", "ipsum"))); + assertTrue(terms.remove(new Term("txt", "dolores"))); + assertTrue(terms.remove(new Term("txt", "nic"))); + assertFalse(terms.remove(new Term("txt", "consequat"))); + assertTrue(terms.isEmpty()); + } + + public void testFilter() throws ParseException { + QueryParser parser = new QueryParser("txt", new WhitespaceAnalyzer()); + Query query = parser.parse("lorem k:ipsum OR (dolores +nic) consequat*"); + AbstractTextTermQueryFilter filter = new AbstractTextTermQueryFilter() { + public String filter(String txt) { + return txt.toUpperCase(); + } + }; + Query filtered = QueryUtils.filter(query, filter); + System.out.println(filtered.toString()); + } +} Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/TestDirectoryReader.java 
=================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/TestDirectoryReader.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/TestDirectoryReader.java (revision 0) @@ -0,0 +1,50 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.lexicon.reader.DirectoryReader; +import org.apache.lucene.lexicon.reader.LexiconReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +/** + * @author Mathieu Lecarme + * + */ +public class TestDirectoryReader extends TestCase { + protected static Directory buildDirectory() throws IOException { + Directory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new StopAnalyzer(), true); + Document document = new Document(); + document.add(new Field("name", new StringReader("Robert Dupond"))); + document.add(new Field("bio", new StringReader("The quick brown fox jumps over the lazy dog"))); + writer.addDocument(document); + document = new Document(); + document.add(new Field("name", new StringReader("John Doe"))); + document.add(new Field("bio", new StringReader("The name John Doe is generally used as a placeholder name for a male party in a legal action or legal discussion whose true identity is unknown."))); + writer.close(); + return directory; + } + + public void testDirectory() throws IOException { + LexiconReader lexiconReader = new DirectoryReader(buildDirectory()); + int cpt = 0; + for(;;) { + if(lexiconReader.next() == null) + break; + cpt++; + } + assertEquals(9, cpt); + } + +} Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/TestDummyStream.java 
=================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/TestDummyStream.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/TestDummyStream.java (revision 0) @@ -0,0 +1,52 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; + +/** + * @author mlecarme + * + */ +public class TestDummyStream extends TestCase { + + public void testStack() throws IOException { + List list = new ArrayList(); + list.add("Pim"); + list.add("Pam"); + list.add("Poum"); + DummyStream dummyStream = new DummyStream(); + for(int a = 0; a < list.size(); a++) + dummyStream.add((String)list.get(a)); + int cpt = 0; + for(;;){ + Token token = dummyStream.next(); + if(token == null) + break; + assertEquals(token.termText(), list.get(cpt)); + cpt ++; + } + assertTrue(dummyStream.isEmpty()); + dummyStream.addAll(list); + cpt = 0; + for(;;){ + Token token = dummyStream.next(); + if(token == null) + break; + assertEquals(token.termText(), list.get(cpt)); + cpt ++; + } + dummyStream = new DummyStream(list); + assertEquals(list.size(), dummyStream.size()); + dummyStream = new DummyStream("plop"); + assertEquals(1, dummyStream.size()); + } + +} Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/TestSuggestions.java =================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/TestSuggestions.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/TestSuggestions.java (revision 0) @@ -0,0 +1,70 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.util.Arrays; +import java.util.Set; +import java.util.TreeSet; + +import junit.framework.TestCase; + +/** + * @author mlecarme + * + */ +public class TestSuggestions extends TestCase { + public void testDistanceMax() { 
+ Suggestions suggestions = new Suggestions(); + suggestions.setDistanceMax(2); + suggestions.setSizeMax(3); + suggestions.add(new Suggestive("toto", new Integer(1), null, null )); + assertEquals(1, suggestions.size()); + suggestions.add(new Suggestive("toto", new Integer(3), null, null )); + assertEquals(1, suggestions.size()); + } + + public void testSet(){ + Set set = new TreeSet(); + Suggestive[] suggestives = new Suggestive[]{ + new Suggestive("Pim", new Integer(1), new Float(42), new Integer(1) ), + new Suggestive("Pam", new Integer(1), new Float(43), new Integer(1) ), + new Suggestive("Poum", new Integer(1), new Float(30), new Integer(1) ), + new Suggestive("The Captain", new Integer(1), new Float(51), new Integer(1) ) + }; + set.addAll(Arrays.asList(suggestives)); + assertEquals(4, set.size()); + } + + public void testAdd() { + Suggestions suggestions = new Suggestions(); + suggestions.setDistanceMax(2); + suggestions.setSizeMax(3); + Suggestive s1 = new Suggestive("toto", new Integer(1), new Float(42), new Integer(1) ); + Suggestive s2 = new Suggestive("toto2", new Integer(1), new Float(42), new Integer(1) ); + assertNotSame(s1, s2); + assertNotSame(new Integer(s1.hashCode()), new Integer(s2.hashCode())); + + Set set = new TreeSet(); + set.add(s1); + set.add(s2); + assertEquals(2, set.size()); + suggestions.add(s1); + suggestions.add(s2); + assertEquals(2, suggestions.size()); + } + + public void testSizeMax() { + Suggestions suggestions = new Suggestions(); + suggestions.setDistanceMax(2); + suggestions.setSizeMax(3); + Suggestive[] suggestives = new Suggestive[]{ + new Suggestive("Pim", new Integer(1), new Float(42), new Integer(1) ), + new Suggestive("Pam", new Integer(1), new Float(43), new Integer(1) ), + new Suggestive("Poum", new Integer(1), new Float(30), new Integer(1) ), + new Suggestive("The Captain", new Integer(1), new Float(51), new Integer(1) ) + }; + suggestions.addAll(Arrays.asList(suggestives)); + assertEquals(3, suggestions.size()); + } 
+} Index: contrib/lexicon/src/test/org/apache/lucene/lexicon/TestSearcher.java =================================================================== --- contrib/lexicon/src/test/org/apache/lucene/lexicon/TestSearcher.java (revision 0) +++ contrib/lexicon/src/test/org/apache/lucene/lexicon/TestSearcher.java (revision 0) @@ -0,0 +1,39 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.lexicon.analyzer.NGramAnalyzer; +import org.apache.lucene.lexicon.reader.DirectoryReader; +import org.apache.lucene.lexicon.reader.LexiconReader; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +/** + * @author mlecarme + * + */ +public class TestSearcher extends TestCase { + public void testSuggestedQuery() throws IOException, ParseException { + Directory directory = TestDirectoryReader.buildDirectory(); + LexiconReader lexiconReader = new DirectoryReader(directory); + Lexicon lexicon = new Lexicon(new RAMDirectory()); + lexicon.addAnalyser(new NGramAnalyzer()); + lexicon.read(lexiconReader); + QueryParser parser = new QueryParser("txt", new WhitespaceAnalyzer()); + Query query = parser.parse("bio:brawn"); + SuggestiveSearcher searcher = new SuggestiveSearcher(new IndexSearcher(directory), lexicon); + SuggestiveHits hits = searcher.searchWithSuggestions(query); + System.out.println(hits.getSuggestedQuery()); + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/AbstractSearcher.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/AbstractSearcher.java (revision 0) +++ 
contrib/lexicon/src/java/org/apache/lucene/lexicon/AbstractSearcher.java (revision 0) @@ -0,0 +1,74 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.HitCollector; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.search.Weight; + +/** + * @author mlecarme + * + */ +public abstract class AbstractSearcher extends Searcher{ + protected Searcher searcher; + protected Lexicon lexicon; + + public void close() throws IOException { + searcher.close(); + } + + public Document doc(int i) throws CorruptIndexException, IOException { + return searcher.doc(i); + } + + public int docFreq(Term term) throws IOException { + return searcher.docFreq(term); + } + + public Explanation explain(Weight weight, int doc) throws IOException { + return searcher.explain(weight, doc); + } + + public int maxDoc() throws IOException { + return searcher.maxDoc(); + } + + public Query rewrite(Query query) throws IOException { + return searcher.rewrite(query); + } + + public void search(Weight weight, Filter filter, HitCollector results) + throws IOException { + searcher.search(weight, filter, results); + } + + public TopDocs search(Weight weight, Filter filter, int n) + throws IOException { + return search(weight, filter, n); + } + + public TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort) + throws IOException { + return searcher.search(weight, filter, n, sort); + } + + public Document doc(int n, FieldSelector fieldSelector) + throws CorruptIndexException, 
IOException { + return searcher.doc(n, fieldSelector); + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/NotIndexedException.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/NotIndexedException.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/NotIndexedException.java (revision 0) @@ -0,0 +1,29 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; + +import org.apache.lucene.lexicon.analyzer.LexiconAnalyzer; + +/** + * This Analyzer was not uesd during indexation + * @author Mathieu Lecarme + * + */ +public class NotIndexedException extends IOException{ + private static final long serialVersionUID = -4347569712585625159L; + + public NotIndexedException() { + super(); + } + + public NotIndexedException(String s) { + super(s); + } + + public NotIndexedException(LexiconAnalyzer analyzer) { + super(analyzer.toString()); + } +} \ No newline at end of file Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/Lexicon.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/Lexicon.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/Lexicon.java (revision 0) @@ -0,0 +1,322 @@ +package org.apache.lucene.lexicon; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedHashSet; + +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.lexicon.analyzer.Distanceable; +import org.apache.lucene.lexicon.analyzer.LexiconAnalyzer; +import org.apache.lucene.lexicon.analyzer.NearAble; +import org.apache.lucene.lexicon.analyzer.SimilarAble; +import org.apache.lucene.lexicon.reader.LexiconReader; +import org.apache.lucene.lexicon.reader.Word; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; + +/** + * A list of word + *
+ * [TODO] check that a word still exists in its source index => lazy delete
+ * [TODO] index the number of occurrences
+ * [TODO] update the indexes (added words, occurrence counts)
+ * 
+ *

+ * A Lexicon uses a Lucene Directory. + *

+ * Lexicon lexicon = new Lexicon(new RAMDirectory());
+ * 
+ * A Lexicon can be fed from a Lucene index or, more simply, from a list of words or a plain text file. + *
+ * String[] words = new String[]{"Lucene", "Apache", "lexicon"};
+ * LexiconReader reader = new SimpleReader(words);
+ * 
+ * Lucene TokenFilter can be used. + *
+ * reader.setTokenFilter(new LowerCaseFilter(reader.getStream()));
+ * lexicon.read(reader);
+ * 
+ * @author Mathieu Lecarme + * @see {http://en.wikipedia.org/wiki/Lexicon} + */ +public class Lexicon { + public static final String WORD = "word"; + public static final String FIELD = "field"; + public static final String POPULARITY = "pop"; + private Integer batchSize = new Integer(1000); + private IndexReader reader; + private IndexSearcher searcher = null; + private Directory spellIndex; + private LinkedHashSet analysers = new LinkedHashSet(); + + /** + * Build a Lexicon with a Directory + * @param spellIndex + * @throws IOException + * @throws LockObtainFailedException + * @throws CorruptIndexException + */ + public Lexicon(Directory spellIndex) throws CorruptIndexException, LockObtainFailedException, IOException { + this.spellIndex = spellIndex; + if (!IndexReader.indexExists(spellIndex)) { + IndexWriter writer = new IndexWriter(spellIndex, null, true); + writer.close(); + } + // close the old searcher, if there was one + if (searcher != null) { + this.searcher.close(); + } + this.searcher = new IndexSearcher(this.spellIndex); + this.reader = this.searcher.getIndexReader(); + } + + public IndexSearcher getSearcher(){ + return searcher; + } + + public IndexReader getReader() throws CorruptIndexException, IOException{ + if (reader == null) { + reader = IndexReader.open(spellIndex); + } + return reader; + } + + /** + * Read a LexiconReader to add some Word + * @param lexiconReader + * @throws IOException + */ + public void read(LexiconReader lexiconReader) throws IOException { + IndexReader.unlock(spellIndex); + IndexWriter writer = new IndexWriter(spellIndex, new WhitespaceAnalyzer(), + !IndexReader.indexExists(spellIndex)); + writer.setMergeFactor(300); + writer.setMaxBufferedDocs(150); + Word word; + for(;;){ + word = lexiconReader.filteredNext(); + if(word == null) + break; + //[FIXME] réutiliser le document existant pour empiler les fields + Document document = new Document(); + document.add(new Field(WORD, word.getText(), Store.YES, Index.UN_TOKENIZED)); 
+ document.add(new Field(POPULARITY, word.getPopularity().toString(), Store.YES, Index.UN_TOKENIZED)); + if(word.getField() != null) + document.add(new Field(FIELD, word.getField(), Store.NO, Index.UN_TOKENIZED)); + Iterator iterator = analysers.iterator(); + while (iterator.hasNext()) { + LexiconAnalyzer dictionaryAnalyser = (LexiconAnalyzer) iterator.next(); + Field[] fields =dictionaryAnalyser.getFields(word.getTerm()); + for(int i = 0; i < fields.length; i++) + document.add(fields[i]); + } + writer.addDocument(document); + } + // close writer + writer.optimize(); + writer.close(); + // close reader so it will be re-opened (and see the new content) when exist() + // is called the next time: + if (reader != null) { + reader.close(); + reader = null; + } + // also re-open the spell index to see our own changes when the next suggestion + // is fetched: + searcher.close(); + searcher = new IndexSearcher(this.spellIndex); + } + + /** + * Remove a term + * @param term + */ + public void remove(Term term){ + + } + + /** + * end batch remove + */ + public void flush(){ + + } + /** + * Add an analyser for parsing indexed terms + * @param analyser + */ + public void addAnalyser(LexiconAnalyzer analyser) { + analysers.add(analyser); + } + + /** + * The smallest distance between two doc + * Levenshtein then any Distanceable implemention registered are used and smallest result are kept + * @param document1 + * @param document2 + * @return + */ + protected Integer distance(String word, Document document) { + if(word == null || document == null) + return null; + if(word.equals(document.get(WORD))) + return new Integer(0); + Iterator iterator = analysers.iterator(); + int distance = StringUtils.getLevenshteinDistance(word, document.get(WORD)); + while(iterator.hasNext()) { + LexiconAnalyzer analyzer = (LexiconAnalyzer)iterator.next(); + if(analyzer instanceof Distanceable) { + distance = Math.min(distance, + ((Distanceable)analyzer).distance(word, document)); + } + } + return 
new Integer(distance); + } + + /** + * Check whether the word exists in the index. + * @param word + * @throws IOException + * @return true if the word exists in the index + */ + public boolean exist(String word) throws IOException { + return getReader().docFreq(new Term(WORD, word)) > 0; + } + + public void empty(){ + } + + /** + * Find similar words + * @param word + * @param fields + * @param similars + * @return + * @throws IOException + */ + public String[] findSimilar(String word, String[] fields, SimilarAble[] similars) throws IOException { + BooleanQuery query = new BooleanQuery(); + if(fields != null && fields.length > 0) { + for(int a = 0; a < fields.length; a++) + query.add(new BooleanClause(new TermQuery( + new Term(FIELD, fields[a])), Occur.MUST)); + } + if(similars == null){ + similars = new SimilarAble[analysers.size()]; + Iterator iterator = analysers.iterator(); + int cpt = 0; + while(iterator.hasNext()){ + LexiconAnalyzer lexiconAnalyzer = (LexiconAnalyzer)iterator.next(); + if(lexiconAnalyzer instanceof SimilarAble) + similars[cpt++] = (SimilarAble)lexiconAnalyzer; + } + } + for(int a = 0; a < similars.length; a++) { + if(! analysers.contains(similars[a])) + throw new NotIndexedException((LexiconAnalyzer)similars[a]); + query.add(new BooleanClause(similars[a].findSimilar(word), Occur.MUST)); + } + Hits hits = getSearcher().search(query); + String[] words = new String[hits.length()]; + for(int a = 0; a < hits.length(); a++) + words[a] = hits.doc(a).get(WORD); + return words; + } + + public String[] findSimilar(String word) throws IOException { + return findSimilar(word, null, null); + } + + /** + * Find near words from a mispelled one. 
+ * @param word + * @param fields + * @param nears + * @return + * @throws IOException + */ + public Suggestions findNear(String word, String[] fields, NearAble[] nears) throws IOException { + BooleanQuery query = new BooleanQuery(); + if(fields != null && fields.length > 0) { + for(int a = 0; a < fields.length; a++) + query.add(new BooleanClause(new TermQuery( + new Term(FIELD, fields[a])), Occur.MUST)); + } + if( nears == null) { + nears = new NearAble[analysers.size()]; + Iterator iterator = analysers.iterator(); + int cpt = 0; + while(iterator.hasNext()){ + LexiconAnalyzer lexiconAnalyzer = (LexiconAnalyzer)iterator.next(); + if(lexiconAnalyzer instanceof NearAble) + nears[cpt++] = (NearAble)lexiconAnalyzer; + } + } + for(int a = 0; a < nears.length; a++) { + if(! analysers.contains(nears[a])) + throw new NotIndexedException((LexiconAnalyzer)nears[a]); + query.add(new BooleanClause(nears[a].findNear(word), Occur.MUST)); + } + Hits hits = getSearcher().search(query); + Suggestions set = new Suggestions(); + for(int a = 0; a < hits.length(); a++){ + Document doc = hits.doc(a); + String suggestedWord = doc.get(WORD); + set.add(new Suggestive(suggestedWord, distance(word, doc), new Float(hits.score(a)), new Integer(Integer.parseInt(doc.get(POPULARITY))))); + } + return set; + } + + public Suggestions findNear(String word) throws IOException { + return findNear(word, null, null); + } + + public Suggestions findNear(Term term) throws IOException { + return findNear(term, null); + } + + public Suggestions findNear(Term term, NearAble[] nears) throws IOException { + return findNear(term.text(), new String[]{ term.field() }, nears); + } + + public Integer getBatchSize() { + return batchSize; + } + + public void setBatchSize(Integer batchSize) { + this.batchSize = batchSize; + } + +} \ No newline at end of file Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/LexiconReader.java =================================================================== --- 
contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/LexiconReader.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/LexiconReader.java (revision 0) @@ -0,0 +1,43 @@ +/** + * + */ +package org.apache.lucene.lexicon.reader; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.lexicon.DummyStream; + +/** + * @author mlecarme + * + */ +public abstract class LexiconReader { + protected TokenFilter tokenFilter = null; + private DummyStream stream = new DummyStream(); + private Word currentWord; + + public abstract Word next() throws IOException; + + public void setTokenFilter(TokenFilter tokenFilter) { + this.tokenFilter = tokenFilter; + } + + public TokenStream getStream() throws IOException { + return stream; + } + + public Word filteredNext() throws IOException { + if(tokenFilter == null) + return this.next(); + if(stream.isEmpty()) { + currentWord = this.next(); + if(currentWord == null) + return null; + stream.add(currentWord.getText()); + } + return new Word(currentWord.getField(), tokenFilter.next().termText(), currentWord.getPopularity()); + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/DirectoryReader.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/DirectoryReader.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/DirectoryReader.java (revision 0) @@ -0,0 +1,84 @@ +/** + * + */ +package org.apache.lucene.lexicon.reader; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; 
+import org.apache.lucene.store.LockObtainFailedException; + +/** + * Lucene Directory reader to feed a Lexicon + * @author mlecarme + *
+ * [TODO] synchronization during reindexing
+ * 
+ * + */ +public class DirectoryReader extends LexiconReader{ + private IndexReader reader; + private IndexSearcher searcher; + private Directory spellIndex; + private TermEnum terms; + private int thresold = 0; + private Set fieldNames; + + public void setThresold(int thresold) { + this.thresold = thresold; + } + + public void setFieldNames(String[] fieldNames) { + if(fieldNames != null) + this.fieldNames = new HashSet(Arrays.asList(fieldNames)); + } + + private void init(Directory spellIndex, String[] fieldNames) throws IOException, LockObtainFailedException { + setFieldNames(fieldNames); + this.spellIndex = spellIndex; + if (!IndexReader.indexExists(spellIndex)) { + IndexWriter writer = new IndexWriter(spellIndex, null, true); + writer.close(); + } + // close the old searcher, if there was one + if (searcher != null) { + this.searcher.close(); + } + this.searcher = new IndexSearcher(this.spellIndex); + this.reader = this.searcher.getIndexReader(); + this.terms = this.reader.terms(); + } + + public DirectoryReader(Directory spellIndex) throws IOException, LockObtainFailedException { + init(spellIndex, null); + } + + public DirectoryReader(Directory spellIndex, String[] fieldNames) throws IOException { + init(spellIndex, fieldNames); + } + + public Word next() throws IOException { + if(! this.terms.next()) + return null; + Term term = terms.term(); + int docFreq = reader.docFreq(term); + if(docFreq < thresold) + return next(); + if(fieldNames != null && ! 
fieldNames.contains(term.field())) + return next(); + return new Word(term, new Integer(docFreq)); + } + + public void seek() throws IOException { + this.terms = this.reader.terms(); + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/Word.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/Word.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/Word.java (revision 0) @@ -0,0 +1,70 @@ +package org.apache.lucene.lexicon.reader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.Term; + +/** + * A Word for Lexicon Entry + * @author Mathieu Lecarme + * + */ +public class Word { + public static final String SIMPLE = "simple"; + private String field = SIMPLE; + private String text; + private Integer popularity = new Integer(1); + + public Word(){} + + public Word(Term term, Integer occurency) { + text = term.text(); + field = term.field(); + this.popularity = occurency; + } + + public Word(String field, String text, Integer occurency) { + this.field = field; + this.text = text; + this.popularity = occurency; + } + + public Word(String text) { + this.text = text; + } + + public String getField() { + return field; + } + + public String getText() { + return text; + } + + public Integer getPopularity() { + return popularity; + } + + public Term getTerm() { + return new Term(getField(), getText()); + } + + public String toString() { + return ""; + } +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/SimpleReader.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/SimpleReader.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/reader/SimpleReader.java (revision 0) @@ -0,0 +1,45 @@ +/** + * + */ +package org.apache.lucene.lexicon.reader; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; + +/** + * @author Mathieu Lecarme + * + */ +public class SimpleReader extends LexiconReader { + private Iterator iterator; + private String field = "simple"; + + public SimpleReader(Iterator iterator) { + this.iterator = iterator; + } + + public SimpleReader(Collection collection) { + this.iterator = collection.iterator(); + } + + public SimpleReader(String[] strings) { + this.iterator = Arrays.asList(strings).iterator(); + } + + public Word next() throws IOException { + if(! 
iterator.hasNext()) + return null; + return new Word(field, (String)iterator.next(), new Integer(1)); + } + + public String getField() { + return field; + } + + public void setField(String field) { + this.field = field; + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/SuggestiveSearcher.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/SuggestiveSearcher.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/SuggestiveSearcher.java (revision 0) @@ -0,0 +1,75 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Sort; + +/** + * @author Mathieu Lecarme + */ +public class SuggestiveSearcher extends AbstractSearcher{ + private int thresold = 10; + + /** + * @param searcher + */ + public SuggestiveSearcher(Searcher searcher, Lexicon lexicon) { + super(); + this.searcher = searcher; + this.lexicon = lexicon; + } + + /** + * @return the thresold + */ + public int getThresold() { + return thresold; + } + + /** + * @param thresold the thresold to set + */ + public void setThresold(int thresold) { + this.thresold = thresold; + } + + public SuggestiveHits searchWithSuggestions(Query query) throws IOException { + return searchWithSuggestions(query, null, null); + } + + public SuggestiveHits searchWithSuggestions(Query query, Filter filter) throws IOException { + return searchWithSuggestions(query, filter, null); + } + + public SuggestiveHits searchWithSuggestions(Query query, Sort sort) throws IOException { + return searchWithSuggestions(query, null, sort); + } + + public SuggestiveHits 
searchWithSuggestions(Query query, Filter filter, Sort sort) throws IOException { + Hits hits = this.searcher.search(query, filter, sort); + if(hits.length() > thresold) + return new SuggestiveHits(hits, null, null); + Set terms = QueryUtils.extractTerms(query); + if(terms == null) + return new SuggestiveHits(hits, null, null); + Map suggestions = new HashMap(); + Iterator iterator = terms.iterator(); + while(iterator.hasNext()) { + Term term = (Term)iterator.next(); + suggestions.put(term.text(), lexicon.findNear(term)); + } + return new SuggestiveHits(hits, suggestions, query); + } +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/TermQueryFilter.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/TermQueryFilter.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/TermQueryFilter.java (revision 0) @@ -0,0 +1,15 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +/** + * @author Mathieu Lecarme + * + */ +public interface TermQueryFilter { + public Query filter(TermQuery term); +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/StemmerAnalyzer.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/StemmerAnalyzer.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/StemmerAnalyzer.java (revision 0) @@ -0,0 +1,94 @@ +package org.apache.lucene.lexicon.analyzer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.lang.reflect.Method; + +import net.sf.snowball.SnowballProgram; + +/** + * Find words with same stemming + * @see net.sf.snowball.SnowballProgram + * @author Mathieu Lecarme + * + */ +public class StemmerAnalyzer extends AbstractSimplifierAnalyzer{ + private static final Object [] EMPTY_ARGS = new Object[0]; + public static final String KEY = "stemmer."; + public static final String STEM = KEY + "stem"; + private SnowballProgram stemmer; + private Method stemMethod; + + public StemmerAnalyzer(String name) { + try { + Class stemClass = + Class.forName("net.sf.snowball.ext." + name + "Stemmer"); + stemmer = (SnowballProgram) stemClass.newInstance(); + // why doesn't the SnowballProgram class have an (abstract?) stem method? + stemMethod = stemClass.getMethod("stem", new Class[0]); + } catch (Exception e) { + throw new RuntimeException(e.toString()); + } + } + + /** + * @param word + * @return stemmed word + */ + public String simplify(String word){ + stemmer.setCurrent(word); + try { + stemMethod.invoke(stemmer, EMPTY_ARGS); + } catch (Exception e) { + throw new RuntimeException(e); + } + return stemmer.getCurrent(); + } + + public String getKey() { + return KEY; + } + + public String getSimpleKey() { + return STEM; + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((stemmer == null) ? 
0 : stemmer.getClass().hashCode()); + return result; + } + + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final StemmerAnalyzer other = (StemmerAnalyzer) obj; + if (stemmer == null) { + if (other.stemmer != null) + return false; + } else if (!stemmer.getClass().equals(other.stemmer.getClass())) + return false; + return true; + } + +} \ No newline at end of file Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/SimilarAble.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/SimilarAble.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/SimilarAble.java (revision 0) @@ -0,0 +1,16 @@ +/** + * + */ +package org.apache.lucene.lexicon.analyzer; + +import org.apache.lucene.search.Query; + +/** + * Can find similar token + * @author mlecarme + * + */ +public interface SimilarAble { + public Query findSimilar(String word); + public String simplify(String word); +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/LexiconAnalyzer.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/LexiconAnalyzer.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/LexiconAnalyzer.java (revision 0) @@ -0,0 +1,40 @@ +package org.apache.lucene.lexicon.analyzer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.document.Field; +import org.apache.lucene.index.Term; + +/** + * A builder for lexicon entry. + * @author Mathieu Lecarme + * + */ +public interface LexiconAnalyzer { + /** + * for each Term indexed, n fields can be return. + * @param term + * @return + */ + public Field[] getFields(Term term); + + /** + * @return the distinct key for the Field. + */ + public String getKey(); +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/WordNetAnalyzer.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/WordNetAnalyzer.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/WordNetAnalyzer.java (revision 0) @@ -0,0 +1,36 @@ +/** + * + */ +package org.apache.lucene.lexicon.analyzer; + +import org.apache.lucene.document.Field; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; + +/** + * @author mlecarme + * + */ +public class WordNetAnalyzer implements LexiconAnalyzer, SimilarAble{ + + public Field[] getFields(Term term) { + // TODO Auto-generated method stub + return null; + } + + public String getKey() { + // TODO Auto-generated method stub + return null; + } + + public Query findSimilar(String word) { + // TODO Auto-generated method stub + return null; + } + + public String simplify(String word) { + // TODO Auto-generated method stub + return null; + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AphoneAnalyser.java 
=================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AphoneAnalyser.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AphoneAnalyser.java (revision 0) @@ -0,0 +1,100 @@ +package org.apache.lucene.lexicon.analyzer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.apache.lucene.aphone.Aphone; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +/** + * @author Mathieu Lecarme + * + */ +public class AphoneAnalyser extends NGramAnalyzer implements Distanceable, SimilarAble { + private Aphone aphone; + public static final String KEY = "aphone."; + public static final String PHONEM = KEY + "phonem"; + + /** + * @param aphone + */ + public AphoneAnalyser(Aphone aphone) { + this.aphone = aphone; + } + + public String getKey() { + return KEY; + } + + public Field[] getFields(Term term) { + List fields = new ArrayList(); + String phoneme = aphone.toPhone(term.text()); + addNgram(phoneme, fields); + fields.add( + new Field(PHONEM, phoneme, Store.YES, Index.UN_TOKENIZED)); + Field[] result = new Field[fields.size()]; + fields.toArray(result); + return result; + } + + public Query findSimilar(String word) { + return new TermQuery( + new Term(PHONEM, simplify(word))); + } + + public String simplify(String word) { + return aphone.toPhone(word); + } + + public int distance(String word, Document doc) { + return StringUtils.getLevenshteinDistance(aphone.toPhone(word), doc.get(PHONEM)); + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((aphone == null) ? 
0 : aphone.getClass().hashCode()); + return result; + } + + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final AphoneAnalyser other = (AphoneAnalyser) obj; + if (aphone == null) { + if (other.aphone != null) + return false; + } else if (!aphone.getClass().equals(other.aphone.getClass())) + return false; + return true; + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/NearAble.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/NearAble.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/NearAble.java (revision 0) @@ -0,0 +1,15 @@ +/** + * + */ +package org.apache.lucene.lexicon.analyzer; + +import org.apache.lucene.search.Query; + +/** + * Can be use for a "do you mean" suggestion + * @author Mathieu Lecarme + * + */ +public interface NearAble { + public Query findNear(String word); +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/NGramAnalyzer.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/NGramAnalyzer.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/NGramAnalyzer.java (revision 0) @@ -0,0 +1,171 @@ +package org.apache.lucene.lexicon.analyzer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.ConstantScoreRangeQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; + +/** + * @see org.apache.lucene.analysis.ngram.NGramTokenizer + * @author Mathieu Lecarme + */ +public class NGramAnalyzer implements LexiconAnalyzer, NearAble { + public static final String KEY = "ngram."; + public static final String SIZE = "size"; + public static final String GRAM = "gram"; + public static final String START = "start"; + public static final String END = "end"; + protected int min = 2; + protected int max = 2; + protected int delta = 1; // delta in size for search + + public NGramAnalyzer() {} + + /** + * @param min + * @param max + */ + public NGramAnalyzer(int min, int max, int delta) { + this.min = min; + this.max = max; + this.delta = delta; + } + + public String getKey() { + return KEY; + } + + protected Set ngram(String word) { + Set set = new LinkedHashSet(); + NGramTokenizer tokenizer = new NGramTokenizer(new 
StringReader(word), min, max); + Token token; + for(;;){ + try { + token = tokenizer.next(); + } catch (IOException e) { + throw new RuntimeException(e); + } + if(token == null) + break; + set.add(token.termText()); + } + return set; + } + + protected void addNgram(String word, List fields) { + Iterator iterator = ngram(word).iterator(); + String currentGram = null; + while(iterator.hasNext()) { + String gram = (String)iterator.next(); + if(currentGram == null) + fields.add( + new Field(getKey() + START, gram, Store.NO, Index.UN_TOKENIZED) + ); + currentGram = gram; + fields.add( + new Field(getKey() + GRAM, gram, Store.NO, Index.UN_TOKENIZED) + ); + } + fields.add( + new Field(getKey() + END, currentGram, Store.NO, Index.UN_TOKENIZED) + ); + fields.add( + new Field(getKey() + SIZE, new Integer(word.length()).toString(), Store.NO, Index.UN_TOKENIZED) + ); + } + + public Field[] getFields(Term term) { + List fields = new ArrayList(); + addNgram(term.text(), fields); + Field[] result = new Field[fields.size()]; + fields.toArray(result); + return result; + } + + public Query findNear(String word) { + BooleanQuery query = new BooleanQuery(); + query.add(new ConstantScoreRangeQuery( + getKey() + SIZE, + new Integer(word.length() -1).toString(), + new Integer(word.length() + 1).toString(), + true, true), Occur.MUST); + BooleanQuery or = new BooleanQuery(); + Iterator iterator = ngram(word).iterator(); + String currentGram = null; + while(iterator.hasNext()) { + String gram = (String)iterator.next(); + if(currentGram == null) + or.add(new TermQuery( + new Term(getKey() + START, gram)), + Occur.SHOULD); + currentGram = gram; + or.add(new TermQuery( + new Term(getKey() + GRAM , gram)), + Occur.SHOULD); + } + or.add(new TermQuery( + new Term(getKey() + END, currentGram)), + Occur.SHOULD); + query.add(or, Occur.MUST); + return query; + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + delta; + result = prime * result + max; + 
result = prime * result + min; + return result; + } + + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final NGramAnalyzer other = (NGramAnalyzer) obj; + if (delta != other.delta) + return false; + if (max != other.max) + return false; + if (min != other.min) + return false; + return true; + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AnagramAnalyzer.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AnagramAnalyzer.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AnagramAnalyzer.java (revision 0) @@ -0,0 +1,45 @@ +/** + * + */ +package org.apache.lucene.lexicon.analyzer; + +import java.util.Iterator; +import java.util.SortedSet; +import java.util.TreeSet; + +/** + * Anagram analyzer + * Each distinct chars of a word are sorted. + * "Lucene" become "celnu" + * @author Mathieu Lecarme + * @see {http://en.wikipedia.org/wiki/Anagram} + */ +public class AnagramAnalyzer extends AbstractSimplifierAnalyzer { + public static final String KEY = "anagram."; + public static final String ANAGRAM = KEY + "anagram"; + + /** + * @param word + * @return the anagramed word + */ + public String simplify(String word) { + SortedSet set = new TreeSet(); + char[] chars = word.toCharArray(); + for(int a = 0; a < chars.length; a++) + set.add(new Character(Character.toLowerCase(chars[a]))); + StringBuffer buffer = new StringBuffer(set.size()); + Iterator iterator = set.iterator(); + while(iterator.hasNext()) + buffer.append(iterator.next()); + return buffer.toString(); + } + + public String getKey() { + return KEY; + } + + public String getSimpleKey() { + return ANAGRAM; + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AbstractSimplifierAnalyzer.java 
=================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AbstractSimplifierAnalyzer.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/AbstractSimplifierAnalyzer.java (revision 0) @@ -0,0 +1,37 @@ +/** + * + */ +package org.apache.lucene.lexicon.analyzer; + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +/** + * @author Mathieu Lecarme + * + */ +public abstract class AbstractSimplifierAnalyzer implements LexiconAnalyzer, SimilarAble, NearAble{ + public abstract String getSimpleKey(); + public abstract String simplify(String word); + + public Field[] getFields(Term term) { + if(term == null) + return null; + return new Field[] { + new Field(getSimpleKey(), simplify(term.text()), Store.NO, Index.UN_TOKENIZED)}; + } + + public Query findSimilar(String word) { + return new TermQuery( + new Term(getSimpleKey(), simplify(word))); + } + + public Query findNear(String word) { + return findSimilar(word); + } + +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/Distanceable.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/Distanceable.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/analyzer/Distanceable.java (revision 0) @@ -0,0 +1,12 @@ +package org.apache.lucene.lexicon.analyzer; + +import org.apache.lucene.document.Document; + +/** + * This LexiconAnalyzer computes the distance between suggested word + * @see LexiconAnalyzer + * @author Mathieu Lecarme + */ +public interface Distanceable { + public int distance(String word, Document doc); +} Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/Suggestive.java 
=================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/Suggestive.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/Suggestive.java (revision 0) @@ -0,0 +1,134 @@ +package org.apache.lucene.lexicon; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** + * A suggested word + * @see Suggestions + * @author Mathieu Lecarme + */ +public class Suggestive implements Comparable{ + private String word; + private Integer distance; + private Float score; + private Integer popularity; + + /** + * @param word + * @param distance + * @param popularity + */ + public Suggestive(String word, Integer distance, Float score, Integer popularity) { + super(); + this.word = word; + this.distance = distance; + this.score= score; + this.popularity = popularity; + } + + public void setWord(String word) { + this.word = word; + } + + public String getWord() { + return word; + } + + public void setDistance(Integer distance) { + this.distance = distance; + } + + public Integer getDistance() { + return distance; + } + + public void setPopularity(Integer popularity) { + this.popularity = popularity; + } + + public Integer getPopularity() { + return popularity; + } + + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((distance == null) ? 0 : distance.hashCode()); + result = prime * result + + ((popularity == null) ? 0 : popularity.hashCode()); + result = prime * result + ((word == null) ? 
0 : word.hashCode()); + return result; + } + + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + final Suggestive other = (Suggestive) obj; + if (distance == null) { + if (other.distance != null) + return false; + } else if (!distance.equals(other.distance)) + return false; + if (popularity == null) { + if (other.popularity != null) + return false; + } else if (!popularity.equals(other.popularity)) + return false; + if (word == null) { + if (other.word != null) + return false; + } else if (!word.equals(other.word)) + return false; + return true; + } + + public int compareTo(Object arg0) { + if(arg0 == null) + return -1; + Suggestive other = (Suggestive)arg0; + if(!getDistance().equals(other.getDistance())) + return getDistance().compareTo(other.getDistance()); + if(!getScore().equals(other.getScore())) + return getScore().compareTo(other.getScore()); + if(! getPopularity().equals(other.getPopularity())) + return getPopularity().compareTo(other.getPopularity()); + return getWord().compareTo(other.getWord()); + } + + /** + * @return the score + */ + public Float getScore() { + return score; + } + + /** + * @param score the score to set + */ + public void setScore(Float score) { + this.score = score; + } + + public String toString() { + return ""; + } +} \ No newline at end of file Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/QueryUtils.java =================================================================== --- contrib/lexicon/src/java/org/apache/lucene/lexicon/QueryUtils.java (revision 0) +++ contrib/lexicon/src/java/org/apache/lucene/lexicon/QueryUtils.java (revision 0) @@ -0,0 +1,82 @@ +/** + * + */ +package org.apache.lucene.lexicon; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import 
org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; + +/** + * @author Mathieu Lecarme + * + */ +public final class QueryUtils { + /** + * extract all Term from TermQuery, recursively + * @param query + * @return + */ + public static final Set extractTerms(Query query) { + Set terms = new HashSet(); + if(query instanceof TermQuery) { + terms.add(((TermQuery)query).getTerm()); + return terms; + } + if(query instanceof BooleanQuery){ + Iterator iterator = ((BooleanQuery)query).clauses().iterator(); + while(iterator.hasNext()) { + Set subTerms = extractTerms(((BooleanClause)iterator.next()).getQuery()); + if(subTerms != null) + terms.addAll(subTerms); + } + return terms; + } + return null; + } + + public static final Query filter(Query query, TermQueryFilter filter) { + if(query instanceof TermQuery) + return filter.filter((TermQuery)query); + if(! (query instanceof BooleanQuery)) + return query; + BooleanQuery filteredQuery = new BooleanQuery(); + filteredQuery.setBoost(query.getBoost()); + Iterator iterator = ((BooleanQuery)query).clauses().iterator(); + while(iterator.hasNext()){ + BooleanClause clause = (BooleanClause)iterator.next(); + clause.setQuery(QueryUtils.filter((Query)clause.getQuery().clone(), filter)); + filteredQuery.add(clause); + } + return filteredQuery; + } + + public static Query buildSimilarQuery(Query query, final Lexicon lexicon, final float coeff) { + return filter(query, new TermQueryFilter() { + public Query filter(TermQuery term) { + String[] similars; + try { + similars = lexicon.findSimilar(term.getTerm().text()); + } catch (IOException e) { + throw new RuntimeException(e); + } + BooleanQuery similarQuery = new BooleanQuery(); + similarQuery.add(term, Occur.MUST); + for(int a=0; a < similars.length; a++){ + TermQuery tq = new TermQuery(new Term(term.getTerm().field(), similars[a])); + tq.setBoost(coeff * 
term.getBoost());
+          similarQuery.add(tq, Occur.SHOULD);
+        }
+        return similarQuery;
+      }
+    });
+  }
+}
Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/DummyStream.java
===================================================================
--- contrib/lexicon/src/java/org/apache/lucene/lexicon/DummyStream.java (revision 0)
+++ contrib/lexicon/src/java/org/apache/lucene/lexicon/DummyStream.java (revision 0)
@@ -0,0 +1,80 @@
+/**
+ *
+ */
+package org.apache.lucene.lexicon;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A dummy token stream: a FIFO queue of words, each one served once as a Token.
+ * Useful for TokenFilter with String input.
+ * @see org.apache.lucene.analysis.TokenStream
+ * @author Mathieu Lecarme
+ */
+public class DummyStream extends TokenStream {
+  // LinkedList: next() removes the head, which is O(1) here and O(n) on an ArrayList
+  private List words = new LinkedList();
+
+  /** Build an empty stream. */
+  public DummyStream() {}
+
+  /**
+   * @param word the first word of the stream
+   */
+  public DummyStream(String word) {
+    super();
+    this.add(word);
+  }
+
+  /**
+   * @param words the initial words of the stream, String only
+   */
+  public DummyStream(Collection words) {
+    super();
+    this.addAll(words);
+  }
+
+  /** Append a word at the end of the stream. */
+  public void add(String word) {
+    this.words.add(word);
+  }
+
+  /** Append words, String only, at the end of the stream. */
+  public void addAll(Collection words) {
+    this.words.addAll(words);
+  }
+
+  /** @return true when no word is waiting */
+  public boolean isEmpty(){
+    return words.isEmpty();
+  }
+
+  /** Drop all the waiting words. */
+  public void clear(){
+    words.clear();
+  }
+
+  /** @return the number of waiting words */
+  public int size() {
+    return words.size();
+  }
+
+  /**
+   * Consume the oldest waiting word.
+   * @return a Token with both offsets set to 0, or null when the stream is empty
+   */
+  public Token next() throws IOException {
+    if(words.isEmpty())
+      return null;
+    // remove(0) hands back the head it removes
+    return new Token((String)words.remove(0), 0, 0);
+  }
+
+}
Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/SimilarTokenFilter.java
===================================================================
--- contrib/lexicon/src/java/org/apache/lucene/lexicon/SimilarTokenFilter.java (revision 0)
+++ contrib/lexicon/src/java/org/apache/lucene/lexicon/SimilarTokenFilter.java (revision 0)
@@ -0,0 +1,82 @@
+/**
+ *
+ */
+package org.apache.lucene.lexicon;
+
+import java.io.IOException;
+import java.util.Stack;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.lexicon.analyzer.SimilarAble;
+
+/**
+ * Generic token filter which provides similar tokens: each token of the input
+ * is replaced by itself and the similar words found in the lexicon.
+ * All the tokens built from one input token share its offsets and its position.
+ * @author Mathieu Lecarme
+ */
+public class SimilarTokenFilter extends TokenFilter{
+  private Lexicon lexicon;
+  private Stack stack = new Stack();
+  private String[] fields = null;
+  private SimilarAble[] similars = null;
+  private Token currentToken;
+
+  /**
+   * A simple similar token filter
+   * @param input a stream
+   * @param lexicon
+   */
+  protected SimilarTokenFilter(TokenStream input, Lexicon lexicon) {
+    super(input);
+    this.lexicon = lexicon;
+  }
+
+  /**
+   * A complete similar token filter. Only fields used during the lexicon
+   * construction and Similarable analyzer are used for this similarity search
+   * @param input
+   * @param lexicon
+   * @param fields which are used during the indexation
+   * @param similars used during the search.
+   */
+  protected SimilarTokenFilter(TokenStream input, Lexicon lexicon, String[] fields, SimilarAble[] similars) {
+    super(input);
+    this.lexicon = lexicon;
+    this.fields = fields;
+    this.similars = similars;
+  }
+
+  /**
+   * Serve the pending words of the current input token, or read the next
+   * input token and look for its similar words when none is pending.
+   * Only the first token served for an input token keeps its position
+   * increment, the following ones get 0: they are alternatives at the same
+   * position, not extra words of the text.
+   * @return the next token, or null at the end of the input
+   */
+  public Token next() throws IOException {
+    boolean first = stack.isEmpty();
+    if(first){
+      currentToken = this.input.next();
+      if(currentToken == null)
+        return null;
+      String word = currentToken.termText();
+      String[] sims = lexicon.findSimilar(word, fields, similars);
+      for(int a=0; a < sims.length; a++){
+        String similar = sims[a];
+        stack.add(similar);
+        if( similar.equals(word))
+          word = null;
+      }
+      if(word != null)
+        stack.add(word);
+    }
+    Token token = new Token((String)stack.pop(), currentToken.startOffset(), currentToken.endOffset());
+    token.setPositionIncrement(first ? currentToken.getPositionIncrement() : 0);
+    return token;
+  }
+
+}
Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/Suggestions.java
===================================================================
--- contrib/lexicon/src/java/org/apache/lucene/lexicon/Suggestions.java (revision 0)
+++ contrib/lexicon/src/java/org/apache/lucene/lexicon/Suggestions.java (revision 0)
@@ -0,0 +1,82 @@
+/**
+ *
+ */
+package org.apache.lucene.lexicon;
+
+import java.util.Iterator;
+import java.util.TreeSet;
+
+/**
+ * A sorted set of suggestion, with a max size, and a distanceMax
+ * @author Mathieu Lecarme
+ */
+public class Suggestions extends TreeSet {
+  private static final long serialVersionUID = 4038368090229602687L;
+  private int distanceMax = 3;
+  private int sizeMax = 10;
+
+  /**
+   * Add a Suggestive, unless its distance is over distanceMax. When the set
+   * grows over sizeMax, its last element is evicted.
+   * @param arg0 a Suggestive
+   * @return true if the set is changed by this call
+   */
+  public boolean add(Object arg0) {
+    Suggestive suggestive = (Suggestive)arg0;
+    if(suggestive.getDistance().intValue() > distanceMax)
+      return false;
+    boolean modif = super.add(arg0);
+    if(modif && size() > sizeMax) {
+      Object evicted = last();
+      remove(evicted);
+      // the newcomer itself was evicted: nothing changed
+      if(evicted == arg0)
+        return false;
+    }
+    return modif;
+  }
+
+  /**
+   * @return the distanceMax
+   */
+  public int getDistanceMax() {
+    return distanceMax;
+  }
+
+  /**
+   * @param distanceMax the distanceMax to set
+   */
+  public void setDistanceMax(int distanceMax) {
+    this.distanceMax = distanceMax;
+  }
+
+  /**
+   * @return the sizeMax
+   */
+  public int getSizeMax() {
+    return sizeMax;
+  }
+
+  /**
+   * @param sizeMax the sizeMax to set
+   */
+  public void setSizeMax(int sizeMax) {
+    this.sizeMax = sizeMax;
+  }
+
+  public Iterator getWordIterator() {
+    final Iterator iterator = iterator();
+    return new Iterator(){
+      public boolean hasNext(){
+        return iterator.hasNext();
+      }
+      public Object next() {
+        return ((Suggestive)iterator.next()).getWord();
+      }
+      public void remove() {
+        iterator.remove();
+      }
+    };
+  }
+
+}
\ No newline at end of file
Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/AbstractTextTermQueryFilter.java
===================================================================
--- contrib/lexicon/src/java/org/apache/lucene/lexicon/AbstractTextTermQueryFilter.java (revision 0)
+++ contrib/lexicon/src/java/org/apache/lucene/lexicon/AbstractTextTermQueryFilter.java (revision 0)
@@ -0,0 +1,35 @@
+/**
+ *
+ */
+package org.apache.lucene.lexicon;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+
+/**
+ * A TermQueryFilter which only rewrites the text of the term: field and boost
+ * are kept.
+ * @author mlecarme
+ */
+public abstract class AbstractTextTermQueryFilter implements TermQueryFilter{
+
+  /**
+   * @return a new TermQuery on the same field, with the filtered text and the
+   *         boost of termQuery
+   */
+  public Query filter(TermQuery termQuery) {
+    Term term = termQuery.getTerm();
+    Term filteredTerm = new Term(term.field(), filter(term.text()));
+    TermQuery filteredTermQuery = new TermQuery(filteredTerm);
+    filteredTermQuery.setBoost(termQuery.getBoost());
+    return filteredTermQuery;
+  }
+
+  /**
+   * @param txt the text of a term
+   * @return the text to use instead
+   */
+  public abstract String filter(String txt);
+
+}
Index: contrib/lexicon/src/java/org/apache/lucene/lexicon/SuggestiveHits.java
===================================================================
--- contrib/lexicon/src/java/org/apache/lucene/lexicon/SuggestiveHits.java (revision 0)
+++ contrib/lexicon/src/java/org/apache/lucene/lexicon/SuggestiveHits.java (revision 0)
@@ -0,0 +1,79 @@
+/**
+ *
+ */
+package org.apache.lucene.lexicon;
+
+import java.util.Map;
+
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Query;
+
+/**
+ * Hits, with the suggestions found for the words of a query.
+ * @author mlecarme
+ */
+public class SuggestiveHits {
+  private Hits hits;
+  private Map suggestions;// String -> Suggestions
+  private Query query;
+
+  /**
+   * @param hits
+   * @param suggestions word to Suggestions, null is read as "no suggestion"
+   * @param query the query rewritten by getSuggestedQuery
+   */
+  public SuggestiveHits(Hits hits, Map suggestions, Query query) {
+    this.hits = hits;
+    this.suggestions = suggestions;
+    this.query = query;
+  }
+
+  /**
+   * @return the hits
+   */
+  public Hits getHits() {
+    return hits;
+  }
+
+  /**
+   * @return the suggestions for this word, or null if there is none
+   */
+  public Suggestions getSuggestions(String word) {
+    if(suggestions == null)
+      return null;
+    return (Suggestions)suggestions.get(word);
+  }
+
+  /**
+   * @return all the suggestions, word to Suggestions
+   */
+  public Map getSuggestions() {
+    return suggestions;
+  }
+
+  /**
+   * Rewrite the query with the first suggestion of each word which has some.
+   * @return the suggested query as a String, or null without suggestion
+   */
+  public String getSuggestedQuery() {
+    if(! isSuggested())
+      return null;
+    return QueryUtils.filter(this.query, new AbstractTextTermQueryFilter() {
+      public String filter(String txt) {
+        Suggestions suggested = (Suggestions)suggestions.get(txt);
+        // no entry, or an entry without any word: keep the text
+        if(suggested == null || suggested.isEmpty())
+          return txt;
+        return (String)suggested.getWordIterator().next();
+      }
+    }).toString();
+  }
+
+  /**
+   * @return true if there is at least one entry in the suggestions
+   */
+  public boolean isSuggested() {
+    return suggestions != null && suggestions.size() > 0;
+  }
+
+}
Index: contrib/lexicon/pom.xml
===================================================================
Index: contrib/lexicon/build.xml
===================================================================
--- contrib/lexicon/build.xml (revision 0)
+++ contrib/lexicon/build.xml (revision 0)
@@ -0,0 +1,84 @@ + + + + + + + + Lexicon + + + + + + + + + + + + + + + + + + + + + + + + + + XML Parser building dependency ${spellchecker.jar} + + + + + XML Parser building dependency ${aphone.jar} + + + + + XML Parser building dependency ${snowball.jar} + + + + + XML Parser building dependency ${analyzers.jar} + + + + + + + + + +
Index: contrib/aphone/src/test/org/apache/lucene/aphone/TestAphoneTokenFilter.java
===================================================================
---
contrib/aphone/src/test/org/apache/lucene/aphone/TestAphoneTokenFilter.java (revision 0)
+++ contrib/aphone/src/test/org/apache/lucene/aphone/TestAphoneTokenFilter.java (revision 0)
@@ -0,0 +1,42 @@
+package org.apache.lucene.aphone;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Runs AphoneTokenFilter with AphoneFr over a tokenized sentence.
+ * @Author Mathieu Lecarme
+ */
+public class TestAphoneTokenFilter extends TestCase{
+  /** Prints the text of each filtered token; nothing is asserted. */
+  public void testFrench() throws IOException{
+    StringReader sentence = new StringReader("Les phonétiques vont sauver le monde");
+    TokenStream phones = new AphoneTokenFilter(new StandardTokenizer(sentence), new AphoneFr());
+    for(Token phone = phones.next(); phone != null; phone = phones.next())
+      System.out.println(phone.termText());
+  }
+}
Index: contrib/aphone/src/test/org/apache/lucene/aphone/TestAphone.java
===================================================================
--- contrib/aphone/src/test/org/apache/lucene/aphone/TestAphone.java (revision 0)
+++ contrib/aphone/src/test/org/apache/lucene/aphone/TestAphone.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.lucene.aphone;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+/**
+ *
+ * @Author Mathieu Lecarme
+ */
+public class TestAphone extends TestCase {
+
+  public void testFrench() {
+    AphoneFr fr = new AphoneFr();
+    assertEquals("LUSEME", fr.toPhone("Lucene"));
+  }
+
+  public void testEnglish() {
+    AphoneEn en = new AphoneEn();
+    assertEquals("LSN", en.toPhone("Lucene"));
+  }
+
+}
\ No newline at end of file
Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneDa.java
===================================================================
--- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneDa.java (revision 0)
+++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneDa.java (revision 0)
@@ -0,0 +1,155 @@
+package org.apache.lucene.aphone;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Locale;
+
+/**
+ * String to phone conversion, "da" rule set.
+ * The rules are tried in table order at the current position of the upper
+ * cased word and the first matching one is applied; a character matched by
+ * no rule is skipped.
+ * No conversion state is kept in the fields of this class between two calls.
+ * @Author Mathieu Lecarme
+ */
+public class AphoneDa extends Aphone{
+  /**
+   * Rules, in matching order: {pattern, phone}.
+   * A leading ^ binds the pattern to the start of the word, a trailing $ to
+   * its end.
+   */
+  private static final String[][] TABLE = {
+    {"AA", "Å"}, {"ACTION", "AKSJON"}, {"AF", "AV"}, {"ASIE", "ASJE"}, {"A", "A"},
+    {"BEDST", "BEST"}, {"BORD", "BOR"}, {"BRYST", "BRØST"}, {"BUREAU", "BYRO"}, {"B", "B"},
+    {"CC", "KS"}, {"CK", "K"}, {"CH", "TJ"}, {"CI", "SI"}, {"CO", "KO"}, {"CY", "SY"}, {"C", "S"},
+    {"^DIG$", "DAJ"}, {"DIG", "DI"}, {"D$", ""}, {"D", "D"},
+    {"EAUX", "O"}, {"EAU", "O"}, {"EJ$", "AJ"}, {"EU", "ØV"}, {"E", "E"}, {"É", "E"}, {"È", "E"},
+    {"FEDT", "FET"}, {"F", "F"},
+    {"G", "G"},
+    {"^HJ", "J"}, {"HÅRD", "HÅR"}, {"HÅND", "HÅN"}, {"H", "H"},
+    {"ION", "JON"}, {"^IND", "IN"}, {"I", "I"},
+    {"J", "J"},
+    {"K", "K"},
+    {"LIG", "LI"}, {"L", "L"},
+    {"MAND", "MAN"}, {"^MIG$", "MAJ"}, {"M", "M"},
+    {"N", "N"},
+    {"OST", "ÅST"}, {"O", "O"}, {"Ó", "O"},
+    {"PH", "F"}, {"P", "P"},
+    {"Q", "KU"},
+    {"REGN", "REJN"}, {"RUG", "RU"}, {"RYG", "RØG"}, {"R", "R"},
+    {"SH", "SJ"}, {"^SIG$", "SAJ"}, {"SKIND", "SKIN"}, {"S'S$", "S"}, {"S", "S"},
+    {"TION", "SJON"}, {"TZ", "TS"}, {"T", "T"},
+    {"U", "U"}, {"Ü", "Y"},
+    {"V", "V"},
+    {"W", "V"},
+    {"X'S", "KS"}, {"X", "KS"},
+    {"YKK", "ØKK"}, {"YND", "ØND"}, {"Y", "Y"},
+    {"Z'S", "S"}, {"Z", "S"},
+    {"Æ", "Æ"}, {"Ä", "Æ"},
+    {"ØB", "ØV"}, {"Ø", "Ø"}, {"Ö", "Ø"},
+    {"Å", "Å"}
+  };
+
+  /** A rule of TABLE, with its anchors decoded. */
+  private static final class Rule {
+    final String pattern;
+    final String phone;
+    final boolean atStart;
+    final boolean atEnd;
+
+    Rule(String spec, String phone) {
+      this.atStart = spec.startsWith("^");
+      this.atEnd = spec.endsWith("$");
+      this.pattern = spec.substring(atStart ? 1 : 0, spec.length() - (atEnd ? 1 : 0));
+      this.phone = phone;
+    }
+
+    /** @return true if the rule applies to word at position pos */
+    boolean matches(String word, int pos) {
+      if(atStart && pos != 0)
+        return false;
+      if(atEnd && pos + pattern.length() != word.length())
+        return false;
+      return word.startsWith(pattern, pos);
+    }
+  }
+
+  private static final Rule[] RULES = compile();
+
+  private static Rule[] compile() {
+    Rule[] rules = new Rule[TABLE.length];
+    for(int i = 0; i < TABLE.length; i++)
+      rules[i] = new Rule(TABLE[i][0], TABLE[i][1]);
+    return rules;
+  }
+
+  /**
+   * @param word a word, may be null
+   * @return phonetic transcription, null for a null word
+   */
+  public String toPhone(String word) {
+    if(word == null)
+      return null;
+    // fixed locale: the rules are written for plain upper case letters,
+    // whatever the default locale of the JVM is
+    String upper = word.toUpperCase(Locale.ENGLISH);
+    StringBuffer phone = new StringBuffer();
+    int pos = 0;
+    while(pos < upper.length()) {
+      Rule rule = firstMatch(upper, pos);
+      if(rule == null) {
+        pos++;
+      } else {
+        append(phone, rule.phone);
+        pos += rule.pattern.length();
+      }
+    }
+    return phone.toString();
+  }
+
+  /** @return the first rule of RULES matching at pos, or null */
+  private static Rule firstMatch(String word, int pos) {
+    for(int i = 0; i < RULES.length; i++)
+      if(RULES[i].matches(word, pos))
+        return RULES[i];
+    return null;
+  }
+
+  /**
+   * Append letter to phone, without doubling the last character of phone.
+   */
+  private static void append(StringBuffer phone, String letter) {
+    if(letter.length() == 0)
+      return;
+    int last = phone.length() - 1;
+    if(last >= 0 && phone.charAt(last) == letter.charAt(0))
+      phone.append(letter.substring(1));
+    else
+      phone.append(letter);
+  }
+
+  /**
+   * Simple test
+   */
+  public static void main(String[] args) {
+    AphoneDa aphone = new AphoneDa();
+    for(int i = 0; i < args.length; i++) {
+      System.out.println(aphone.toPhone(args[i]));
+    }
+  }
+}
Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneFo.java
===================================================================
--- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneFo.java (revision 0)
+++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneFo.java (revision 0)
@@ -0,0 +1,435 @@
+package org.apache.lucene.aphone;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * String to phone conversion + * @Author Mathieu Lecarme + */ +public class AphoneFo extends Aphone{ + private StringBuffer phone; + private String word; + private boolean starting; + + /** + * @param a word + * @return phonetic transcription + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( word.length() >= 2 && word.substring(0, 2).equals("AA") ) { + this.append("Å"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("AFT") ) { + this.append("AT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AH") && word.length() == 2) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("A") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Á") ) { + this.append("Á"); + this.word = 
this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("B") ) { + this.append("B"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CC") ) { + this.append("KK"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CK") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("CHR") ) { + this.append("KR"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CH") ) { + this.append("SJ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CI") ) { + this.append("SI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CO") ) { + this.append("KO"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CY") ) { + this.append("SY"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("C") ) { + this.append("C"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("D") ) { + this.append("D"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ÐUR") ) { + this.append("VUR"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ð") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EAUX") ) { + this.append("O"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EAU") ) { + 
this.append("O"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EUS") ) { + this.append("ØVS"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("E") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("É") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("È") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("F") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("G") ) { + this.append("G"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("HJ") ) { + this.append("J"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("HÅRD") ) { + this.append("HÅR"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("HÅND") ) { + this.append("HÅN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("H") ) { + this.append("H"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("IÐ") && word.length() == 2) { + this.append("Í"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("IÐ") && word.length() == 2) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("IND") ) { + this.append("IN"); + this.word = 
this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("I") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Í") ) { + this.append("Í"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("J") ) { + this.append("J"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("KE") ) { + this.append("TJE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("K") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("LIG") ) { + this.append("LI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("L") ) { + this.append("L"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("M") ) { + this.append("M"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("N") ) { + this.append("N"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OCH") ) { + this.append("OK"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("O") ) { + this.append("O"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ó") ) { + this.append("Ó"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PH") ) { + this.append("F"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("P") ) { + 
this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Q") ) { + this.append("KU"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("R") ) { + this.append("R"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SH") ) { + this.append("SJ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("SIÓN") ) { + this.append("SJÓN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("S'S") && word.length() == 3) { + this.append("S"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TH") && word.length() == 2) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("TIÓN") ) { + this.append("SJÓN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("T") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("U") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ú") ) { + this.append("Ú"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ü") ) { + this.append("Y"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("V") ) { + this.append("V"); + this.word = this.word.substring(1); + return true; + } + if( 
word.length() >= 1 && word.substring(0, 1).equals("W") ) { + this.append("V"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("X'S") && word.length() == 3) { + this.append("KS"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("KS"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Y") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ý") ) { + this.append("Í"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("Z'S") && word.length() == 3) { + this.append("S"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Z") && word.length() == 1) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Z") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Æ") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ä") ) { + this.append("Æ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ØRN") ) { + this.append("ØDN"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ø") ) { + this.append("Ø"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ö") ) { + this.append("Ø"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Å") ) { + this.append("Á"); + 
this.word = this.word.substring(1); + return true; + } + + /* Reached only when none of the rules above returned: drop one character and emit nothing for it. */ this.word = this.word.substring(1); + return false; + } + /** + * Command-line smoke test: prints the phonetic code of each argument on its own line of standard output. + */ + public static void main(String[] args) { + AphoneFo aphone = new AphoneFo(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneRu.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneRu.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneRu.java (revision 0) @@ -0,0 +1,595 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Rule-based phonetic transcription driven by the Cyrillic rules in eat(). NOTE(review): the word in progress lives in instance fields and no synchronization is visible, so sharing one instance between threads looks unsafe. + * @author Mathieu Lecarme + */ +public class AphoneRu extends Aphone{ + /* code produced so far for the current word */ private StringBuffer phone; + /* upper-cased remainder of the word that is still to be consumed */ private String word; + /* true only while the first rule of a word is being applied */ private boolean starting; + + /** Upper-cases the word (default locale) and calls eat() until nothing of it is left. + * @param word the word to transcribe; null is allowed + * @return the phonetic transcription, or null when word is null + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + /** Adds letter to the output; when its first character equals the last character already emitted, that first character is skipped. An empty letter adds nothing. */ + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + /** Applies the first rule that matches the start of word: emits the rule's code through append() and removes the characters the rule consumes. Returns true when a rule matched, false when the leading character was dropped unmatched. */ + private boolean eat() { + if( word.length() >= 2 && word.substring(0, 1).equals("А") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("АЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("А") ) { + this.append("А"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Б") && "БП".indexOf(word.charAt(1)) != -1 ) { + /* emits nothing and removes only the first letter; the following Б or П is handled by the next call */ this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Б") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("БА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("БЬ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("БЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("БЪ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("БЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Б") ) { + 
this.append("П"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("В") && "ВФ".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("В") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ВА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ВЬ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ВЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ВЪ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ВЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("В") ) { + this.append("Ф"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Г") && "ГК".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Г") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ГА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ГЬ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ГЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ГЪ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ГЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Г") ) { + this.append("К"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Д") && "ДТ".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = 
this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Д") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ДА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ДЬ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ДЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ДЪ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ДЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ДЗ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ДЗА"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Д") && "ЗС".indexOf(word.charAt(1)) != -1 ) { + this.append("Ц"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Д") ) { + this.append("Т"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("Е") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Е") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЯЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Е") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("Ё") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ё") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЯЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 
1).equals("Ё") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ж") && "ЖШ".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ж") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЖА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЖЬ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ЖЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЖЪ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ЖЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ж") ) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("З") && "ЗС".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("З") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЗА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЗЬ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ЗЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЗЪ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ЗЯ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("З") ) { + this.append("С"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("И") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + 
this.append("АЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("И") ) { + this.append("А"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЙЙ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Й") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("Я"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Й") ) { + this.append("Й"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("К") && "ГК".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("К") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("КА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("К") ) { + this.append("К"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЛЛ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Л") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЛА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Л") ) { + this.append("Л"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ММ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("М") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("МА"); + this.word = this.word.substring(2); + return true; 
+ } + if( word.length() >= 1 && word.substring(0, 1).equals("М") ) { + this.append("М"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("НН") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Н") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("НА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Н") ) { + this.append("Н"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("О") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("АЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("О") ) { + this.append("А"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("П") && "БП".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("П") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ПА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("П") ) { + this.append("П"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("РР") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Р") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("РА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Р") ) { + this.append("Р"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("С") && 
"ЗС".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("С") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("СА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("С") ) { + this.append("С"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Т") && "ДТ".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Т") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ТА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ТЗ") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(2)) != -1 ) { + this.append("ТЗА"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Т") && "ЗС".indexOf(word.charAt(1)) != -1 ) { + this.append("Ц"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Т") ) { + this.append("Т"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("У") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("АЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("У") ) { + this.append("А"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ф") && "ВФ".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ф") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ФА"); + this.word = 
this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ф") ) { + this.append("Ф"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ХХ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Х") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ХА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Х") ) { + this.append("Х"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЦЦ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ц") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЦА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ц") ) { + this.append("Ц"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЧЧ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ч") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЧА"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ч") ) { + this.append("Ч"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ш") && "ЖШ".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ш") && "АОЭЫУЯЁЕИЮ".indexOf(word.charAt(1)) != -1 ) { + this.append("ША"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && 
word.substring(0, 1).equals("Ш") ) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Щ") ) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ъ") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + /* Ъ plus one of ЕЁЮЯ is emitted as a single Я and both characters are consumed */ this.append("Я"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ъ") ) { + /* any other Ъ is consumed without emitting anything */ this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ы") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("АЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ы") ) { + this.append("А"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ь") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("Я"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ь") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Э") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("АЯ"); + this.word = this.word.substring(1); /* NOTE(review): the parallel rule for Ы above consumes 2 characters after emitting "АЯ"; this one consumes only 1, so the following vowel is processed again by the next call - confirm against the rule table this code was produced from */ + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Э") ) { + this.append("А"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("Ю") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ю") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЯЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ю") ) { + this.append("Я"); + this.word = 
this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("Я") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Я") && "ЕЁЮЯ".indexOf(word.charAt(1)) != -1 ) { + this.append("ЯЯ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Я") ) { + this.append("Я"); + this.word = this.word.substring(1); + return true; + } + + /* Reached only when none of the rules above returned: drop one character and emit nothing for it. */ this.word = this.word.substring(1); + return false; + } + /** + * Command-line smoke test: prints the phonetic code of each argument on its own line of standard output. + */ + public static void main(String[] args) { + AphoneRu aphone = new AphoneRu(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneTokenFilter.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneTokenFilter.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneTokenFilter.java (revision 0) @@ -0,0 +1,45 @@ +package org.apache.lucene.aphone; + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Token filter that replaces the term text of every token from the wrapped stream with the code returned by an Aphone. */ +public class AphoneTokenFilter extends TokenFilter { + /* transcriber applied to each term */ private Aphone aphone; + /** Wraps input and transcribes with a newly created AphoneEn. */ protected AphoneTokenFilter(TokenStream input) { + super(input); + this.aphone = new AphoneEn(); + } + /** Wraps input and transcribes with the given aphone; the argument is stored as is, without a null check. */ public AphoneTokenFilter(TokenStream input, Aphone aphone){ + super(input); + this.aphone = aphone; + } + /** Returns the next token of the wrapped stream after replacing its term text with aphone.toPhone(termText), or null when the wrapped stream returns null. The token object itself is modified and handed back. */ public Token next() throws IOException { + Token t = input.next(); + if (t == null) + return null; + //[FIXME] using #termBuffer for Lucene 2.4 + t.setTermText(aphone.toPhone(t.termText())); + return t; + } +} \ No newline at end of file Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneFr.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneFr.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneFr.java (revision 0) @@ -0,0 +1,500 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Rule-based phonetic transcription driven by the rules in eat(). NOTE(review): the word in progress lives in instance fields and no synchronization is visible, so sharing one instance between threads looks unsafe. + * @author Mathieu Lecarme + */ +public class AphoneFr extends Aphone{ + /* code produced so far for the current word */ private StringBuffer phone; + /* upper-cased remainder of the word that is still to be consumed */ private String word; + /* true only while the first rule of a word is being applied */ private boolean starting; + + /** Upper-cases the word independently of the JVM default locale and calls eat() until nothing of it is left. + * @param word the word to transcribe; null is allowed + * @return the phonetic transcription, or null when word is null + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + /* fixed locale: under a Turkish default locale "i" would become U+0130 and miss the Latin rules such as "AI" */ this.word = word.toUpperCase(java.util.Locale.ENGLISH); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + /** Adds letter to the output; when its first character equals the last character already emitted, that first character is skipped. An empty letter adds nothing. */ + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + /** Applies the first rule that matches the start of word: emits the rule's code through append() and removes the characters the rule consumes. */ + private boolean eat() { + if( word.length() >= 3 && word.substring(0, 3).equals("AIX") && word.length() == 3) { + this.append("E"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AI") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AN") && "AEUIO".indexOf(word.charAt(2)) != -1 ) { + this.append("AM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AN") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AMM") ) { + this.append("AM"); 
+ this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AM") && "AEUIO".indexOf(word.charAt(2)) != -1 ) { + this.append("AM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AM") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AUD") && word.length() == 3) { + this.append("O"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AUX") && word.length() == 3) { + this.append("O"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AU") ) { + this.append("O"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("A") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Â") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("À") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("BB") ) { + this.append("P"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("B") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ç") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("C") && "EI".indexOf(word.charAt(1)) != -1 ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CU") && "EI".indexOf(word.charAt(2)) != -1 ) { + 
this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CC") && "EI".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CC") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CH") ) { + this.append("CH"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("C") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("DD") ) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("D") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 8 && word.substring(0, 8).equals("EMMENTAL") ) { + this.append("EMATAL"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 9 && word.substring(0, 9).equals("EMMENTHAL") ) { + this.append("EMATAL"); + this.word = this.word.substring(9); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EM") && "AEIOU".indexOf(word.charAt(2)) != -1 ) { + this.append("EM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EM") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ET") && word.length() == 2) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EUX") && word.length() == 3) { + this.append("E"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EU") ) 
{ + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EN") && "AEUIO".indexOf(word.charAt(2)) != -1 ) { + this.append("EM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EN") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ER") && word.length() == 2) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EO") ) { + this.append("O"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EAUX") && word.length() == 4) { + this.append("O"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EAU") ) { + this.append("O"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("E") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("È") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("É") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ê") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("F") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("G") && "EIY".indexOf(word.charAt(1)) != -1 ) { + this.append("J"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("GU") && 
"EIY".indexOf(word.charAt(2)) != -1 ) { + this.append("G"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("G") ) { + this.append("G"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("H") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("I") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Î") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("J") ) { + this.append("J"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("KS") ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("K") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("LL") ) { + this.append("L"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("L") ) { + this.append("L"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("MM") ) { + this.append("M"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("M") ) { + this.append("M"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("NN") ) { + this.append("M"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("N") ) { + this.append("M"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && 
word.substring(0, 3).equals("OEU") ) { + this.append("E"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OUX") && word.length() == 3) { + this.append("U"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OU") ) { + this.append("U"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OÙ") ) { + this.append("U"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("O") ) { + this.append("O"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ô") ) { + this.append("O"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PP") ) { + this.append("P"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PH") ) { + this.append("F"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("P") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("QU") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Q") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("RIX") && word.length() == 3) { + this.append("RI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("RR") ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("R") ) { + this.append("R"); + this.word = this.word.substring(1); + 
return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") && word.length() == 1) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SS") ) { + this.append("S"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TT") ) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("T") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("U") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ù") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Û") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("V") ) { + this.append("V"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("W") ) { + this.append("W"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("X"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Y") && "AEOU".indexOf(word.charAt(1)) != -1 ) { + this.append("IL"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Y") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ZZ") ) { + 
this.append("S"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Z") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test + */ + public static void main(String[] args) { + AphoneFr aphone = new AphoneFr(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneBg.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneBg.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneBg.java (revision 0) @@ -0,0 +1,710 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * String to phone conversion + * @Author Mathieu Lecarme + */ +public class AphoneBg extends Aphone{ + private StringBuffer phone; + private String word; + private boolean starting; + + /** + * @param a word + * @return phonetic transcription + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( word.length() >= 1 && word.substring(0, 1).equals("А") ) { + this.append("Ъ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ъ") ) { + this.append("Ъ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("О") ) { + this.append("У"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("У") ) { + this.append("У"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Е") ) { + this.append("И"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("И") ) { + this.append("И"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ю") ) { + this.append("У"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Я") ) { + this.append("Ъ"); + this.word = this.word.substring(1); + return true; + } + if( 
word.length() >= 1 && word.substring(0, 1).equals("Й") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ь") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("БСК") ) { + this.append("ПК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("БД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("БT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("БД") && word.length() == 2) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("БT") && word.length() == 2) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Б") ) { + this.append("П"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ВСК") ) { + this.append("ФК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ВД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ВT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ВД") && word.length() == 2) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + 
if( word.length() >= 2 && word.substring(0, 2).equals("ВT") && word.length() == 2) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("В") ) { + this.append("Ф"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ГСК") ) { + this.append("К"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ГД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ГT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ГД") && word.length() == 2) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ГT") && word.length() == 2) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Г") ) { + this.append("К"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ДСК") ) { + this.append("ТК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ДД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Т"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ДT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Т"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ДД") && word.length() == 2) { + this.append("Т"); + this.word = 
this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ДT") && word.length() == 2) { + this.append("Т"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Д") ) { + this.append("Т"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ЖСК") ) { + this.append("ШК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЖД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЖT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЖД") && word.length() == 2) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЖT") && word.length() == 2) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ж") ) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ЗСК") ) { + this.append("СК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЗД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЗT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЗД") && word.length() == 2) { + 
this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЗT") && word.length() == 2) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("З") ) { + this.append("С"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("КСК") ) { + this.append("К"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("КД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("КT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("КД") && word.length() == 2) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("КT") && word.length() == 2) { + this.append("К"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("К") ) { + this.append("К"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ЛСК") ) { + this.append("ЛК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЛД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Л"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЛT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Л"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЛД") 
&& word.length() == 2) { + this.append("Л"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЛT") && word.length() == 2) { + this.append("Л"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Л") ) { + this.append("Л"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("МСК") ) { + this.append("МК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("МД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("М"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("МT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("М"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("МД") && word.length() == 2) { + this.append("М"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("МT") && word.length() == 2) { + this.append("М"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("М") ) { + this.append("М"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("НСК") ) { + this.append("НК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("НД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Н"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("НT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Н"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && 
word.substring(0, 2).equals("НД") && word.length() == 2) { + this.append("Н"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("НT") && word.length() == 2) { + this.append("Н"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Н") ) { + this.append("Н"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ПСК") ) { + this.append("ПК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ПД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ПT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ПД") && word.length() == 2) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ПT") && word.length() == 2) { + this.append("П"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("П") ) { + this.append("П"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("РСК") ) { + this.append("РК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("РД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Р"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("РT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Р"); + this.word = this.word.substring(2); + return true; + } + 
if( word.length() >= 2 && word.substring(0, 2).equals("РД") && word.length() == 2) { + this.append("Р"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("РT") && word.length() == 2) { + this.append("Р"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Р") ) { + this.append("Р"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ССК") ) { + this.append("СК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("СД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("СT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("СД") && word.length() == 2) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("СT") && word.length() == 2) { + this.append("С"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("С") ) { + this.append("С"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ТСК") ) { + this.append("ТК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ТД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Т"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ТT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Т"); + this.word = 
this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ТД") && word.length() == 2) { + this.append("Т"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ТT") && word.length() == 2) { + this.append("Т"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Т") ) { + this.append("Т"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ФСК") ) { + this.append("ФК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ФД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ФT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ФД") && word.length() == 2) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ФT") && word.length() == 2) { + this.append("Ф"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ф") ) { + this.append("Ф"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ХСК") ) { + this.append("ХК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ХД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Х"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ХT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + 
this.append("Х"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ХД") && word.length() == 2) { + this.append("Х"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ХT") && word.length() == 2) { + this.append("Х"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Х") ) { + this.append("Х"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ЦСК") ) { + this.append("ЦК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЦД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ц"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЦT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ц"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЦД") && word.length() == 2) { + this.append("Ц"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЦT") && word.length() == 2) { + this.append("Ц"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ц") ) { + this.append("Ц"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ЧСК") ) { + this.append("ЧК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЧД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ч"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ЧT") && 
"БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ч"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЧД") && word.length() == 2) { + this.append("Ч"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ЧT") && word.length() == 2) { + this.append("Ч"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ч") ) { + this.append("Ч"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ШСК") ) { + this.append("ШК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ШД") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ШT") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(2)) != -1 ) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ШД") && word.length() == 2) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ШT") && word.length() == 2) { + this.append("Ш"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ш") ) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ЩСК") ) { + this.append("ШК"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Щ") && "БВГДЖЗКЛМНПРСТГХЦЧШЩ".indexOf(word.charAt(1)) != -1 ) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && 
word.substring(0, 1).equals("Щ") && word.length() == 1) { + this.append("Ш"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Щ") ) { + this.append("Щ"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test: prints the result of toPhone for every command-line argument, one per line. + */ + public static void main(String[] args) { + AphoneBg aphone = new AphoneBg(); /* a single instance is reused for all arguments */ + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneDe.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneDe.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneDe.java (revision 0) @@ -0,0 +1,2405 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * String to phone conversion: consumes an upper-cased word from the front, rule by rule, and builds its phonetic code. Working state lives in the instance fields while toPhone runs. + * @author Mathieu Lecarme + */ +public class AphoneDe extends Aphone{ + private StringBuffer phone; /* phonetic code built so far */ + private String word; /* not-yet-consumed remainder of the upper-cased input */ + private boolean starting; /* true only during the first eat() call for a word */ + + /** Transcribes a word: upper-cases it, then calls eat() until nothing of the word is left. + * @param word the word to transcribe; a null word yields null + * @return phonetic transcription, or null when word is null (an empty word gives an empty string) + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); /* NOTE(review): upper-casing uses the default locale, and String.toUpperCase() expands the sharp s (ß) to SS, so the ß rule in eat() looks unreachable - confirm */ + this.starting = true; /* rules guarded by this.starting can only match on the first eat() call */ + while( this.word.length() > 0 ) { + this.eat(); /* return value is ignored here */ + this.starting = false; + } + return this.phone.toString(); + } + + /** Appends letter to the phonetic code, dropping its first character when that character equals the current last character of the code; an empty letter is ignored. */ private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( word.length() >= 3 && word.substring(0, 3).equals("ÄER") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ÄU") ) { + this.append("EU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ä") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("É") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ÖER") ) { + this.append("Ö"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ö") ) { + this.append("Ö"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("ÜBER") ) { + this.append("IPA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ÜER") ) { + this.append("I"); + this.word = 
this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ü") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("ß") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("ABELLE") && word.length() == 6) { + this.append("APL"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("ABELL") && word.length() == 5) { + this.append("APL"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 7 && word.substring(0, 7).equals("ABIENNE") && word.length() == 7) { + this.append("APIN"); + this.word = this.word.substring(7); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ACEY") && word.length() == 4) { + this.append("AZI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AEU") ) { + this.append("EU"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AE") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("AGNI") ) { + this.append("AKN"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("AGNIE") ) { + this.append("ANI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("AGN") && "AEOU".indexOf(word.charAt(3)) != -1 && word.length() == 4) { + this.append("ANI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AIA") ) { + this.append("AIA"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && 
word.substring(0, 3).equals("AIE") && word.length() == 3) { + this.append("E"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("AILL") && "EOU".indexOf(word.charAt(4)) != -1 ) { + this.append("ALI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AINE") && word.length() == 4) { + this.append("EN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AIRE") && word.length() == 4) { + this.append("ER"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AIR") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AISE") && word.length() == 4) { + this.append("EZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 8 && word.substring(0, 8).equals("AISSANCE") && word.length() == 8) { + this.append("EZANZ"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("AISSE") && word.length() == 5) { + this.append("EZ"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AIX") && word.length() == 3) { + this.append("EX"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AJ") && "AÄEIOÖUÜ".indexOf(word.charAt(2)) != -1 ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("AKTIE") ) { + this.append("AXIE"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("ALO") && "IY".indexOf(word.charAt(3)) != -1 ) { + this.append("ALUI"); + this.word = this.word.substring(4); + 
return true; + } + if( word.length() >= 7 && word.substring(0, 6).equals("AMATEU") && "RS".indexOf(word.charAt(6)) != -1 ) { + this.append("ANATÖ"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 7 && word.substring(0, 7).equals("ANIELLE") && word.length() == 7) { + this.append("ANIL"); + this.word = this.word.substring(7); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("ANTI") ) { + this.append("ANTI"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("ANVER") ) { + this.append("ANFA"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ATIA") && word.length() == 4) { + this.append("ATIA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("ATIA") && "NS".indexOf(word.charAt(4)) != -1 ) { + this.append("ATI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("ATI") && "AÄOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("AZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AUAU") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AUER") ) { + this.append("AUA"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("AUF") ) { + this.append("AUF"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AULT") && word.length() == 4) { + this.append("U"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("AUSSE") && word.length() == 5) { + this.append("UZ"); + this.word = 
this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("AUS") && "ST".indexOf(word.charAt(3)) != -1 ) { + this.append("AUZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("AUS") ) { + this.append("AUZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("AUTO") ) { + this.append("AUTU"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("AUX") && "IY".indexOf(word.charAt(3)) != -1 ) { + this.append("AUX"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AUX") ) { + this.append("U"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AU") ) { + this.append("AU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("AVIER") && word.length() == 5) { + this.append("AFIE"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("AYER") ) { + this.append("EI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AY") && "AÄEIOÖUÜ".indexOf(word.charAt(2)) != -1 ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("A") && "IJY".indexOf(word.charAt(1)) != -1 ) { + this.append("EI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("A") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("BEA") && "BCMNRU".indexOf(word.charAt(3)) != -1 ) { + 
this.append("PEA"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 4).equals("BEAT") && "AEIMORU".indexOf(word.charAt(4)) != -1 ) { + this.append("PEAT"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("BEIGE") && word.length() == 5) { + this.append("PEZ"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("BE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("PE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("BETTE") && word.length() == 5) { + this.append("PET"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("BIC") && word.length() == 3) { + this.append("PIZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("BOWL") && "EI".indexOf(word.charAt(4)) != -1 ) { + this.append("PUL"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("BP") && "AÄEIOÖRUÜY".indexOf(word.charAt(2)) != -1 ) { + this.append("P"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("BUDGET") ) { + this.append("PIKE"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("BUFFET") ) { + this.append("PIFE"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("BYLLE") && word.length() == 5) { + this.append("PILE"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("BYLL") && word.length() == 4) { + this.append("PIL"); + this.word = 
this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("BYTE") ) { + this.append("PEIT"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("B") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CÄ") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CÜ") && word.length() == 2) { + this.append("ZI"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 4).equals("CACH") && "EI".indexOf(word.charAt(4)) != -1 ) { + this.append("KEZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("CAE") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CA") && "IY".indexOf(word.charAt(2)) != -1 && word.length() == 3) { + this.append("ZEI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("CCH") ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("CCE") ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CE") && "EIJUY".indexOf(word.charAt(2)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("CENT") ) { + this.append("ZENT"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 5).equals("CERST") && "EI".indexOf(word.charAt(5)) != -1 ) { + this.append("KE"); + this.word = this.word.substring(2); + 
return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("CER") && word.length() == 3) { + this.append("ZA"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CE") ) { + this.append("ZE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("CHAO") && "ST".indexOf(word.charAt(4)) != -1 ) { + this.append("KAU"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 7 && word.substring(0, 7).equals("CHAMPIO") ) { + this.append("ZENPI"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 4).equals("CHAR") && "AI".indexOf(word.charAt(4)) != -1 ) { + this.append("KAR"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("CHAU") && "CDFSVWXZ".indexOf(word.charAt(4)) != -1 ) { + this.append("ZU"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("CHE") && "CF".indexOf(word.charAt(3)) != -1 ) { + this.append("ZE"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("CHEM") ) { + this.append("KE"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("CHEQUE") ) { + this.append("ZEK"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("CHI") && "CFGPVW".indexOf(word.charAt(3)) != -1 ) { + this.append("ZI"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("CH") && "AEUY".indexOf(word.charAt(2)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && 
word.substring(0, 3).equals("CHK") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("CH") && "LOR".indexOf(word.charAt(2)) != -1 ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("CHST") ) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CH") && "SßXZ".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CH") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("CIER") && word.length() == 4) { + this.append("ZIE"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("CYB") ) { + this.append("ZEI"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("CY") ) { + this.append("ZI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("C") && "IJY".indexOf(word.charAt(1)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("CKST") ) { + this.append("XT"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CK") && "SßXZ".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("C") && "CK".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 7 && 
word.substring(0, 7).equals("CLAUDET") ) { + this.append("KLU"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 8 && word.substring(0, 8).equals("CLAUDINE") && word.length() == 8) { + this.append("KLUTIN"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("COLE") && word.length() == 4) { + this.append("KUL"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("COUCH") ) { + this.append("KAUZ"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("CQUES") && word.length() == 5) { + this.append("K"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("CQUE") ) { + this.append("K"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("CREAT") ) { + this.append("KREA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("CST") ) { + this.append("XT"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("CS") ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("C") && "SßX".indexOf(word.charAt(1)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CT") && "SßXZ".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CZ") ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("C") ) { + 
this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("D'H") ) { + this.append("T"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("D'S") && word.length() == 3) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 4).equals("DAVO") && "NR".indexOf(word.charAt(4)) != -1 && word.length() == 5) { + this.append("TAFU"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("DD") && "SZ".indexOf(word.charAt(2)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("DEPOT") ) { + this.append("TEPU"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("DESIGN") ) { + this.append("TIZEIN"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("DE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("TE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("DETTE") && word.length() == 5) { + this.append("TET"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("DIC") && word.length() == 3) { + this.append("TIZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("DJ") && "AEIOU".indexOf(word.charAt(2)) != -1 ) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("DS") && "CH".indexOf(word.charAt(2)) != -1 ) { + this.append("T"); + this.word = 
this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("DST") ) { + this.append("ZT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("DT") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("DUIS") ) { + this.append("TI"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("DURCH") ) { + this.append("TURK"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("DZS") && "CH".indexOf(word.charAt(3)) != -1 ) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("D") && "SßZ".indexOf(word.charAt(1)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("D") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("EAULT") && word.length() == 5) { + this.append("U"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EAUX") && word.length() == 4) { + this.append("U"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EAU") ) { + this.append("U"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EAV") ) { + this.append("IF"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EA") && "AÄEIOÖÜY".indexOf(word.charAt(2)) != -1 ) { + this.append("EA"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 
&& word.substring(0, 2).equals("EA") && word.length() == 2) { + this.append("EA"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EA") ) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("EBEN") ) { + this.append("EPN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EE") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EIEI") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EIH") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("EILLE") && word.length() == 5) { + this.append("EI"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EI") ) { + this.append("EI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EJ") && word.length() == 2) { + this.append("EI"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("EL") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EL") && "DKL".indexOf(word.charAt(2)) != -1 ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EL") && "MNT".indexOf(word.charAt(2)) != -1 && word.length() == 3) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("ELYNE") && word.length() == 5) { + 
this.append("ELINE"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ELYN") && word.length() == 4) { + this.append("ELIN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EL") && "AÄEIOÖUÜY".indexOf(word.charAt(2)) != -1 ) { + this.append("EL"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EL") ) { + this.append("L"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("EM") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EM") && "DFKMPQT".indexOf(word.charAt(2)) != -1 ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EM") && "AÄEIOÖUÜY".indexOf(word.charAt(2)) != -1 ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EM") ) { + this.append("N"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("EN") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EN") && "CDGKQT".indexOf(word.charAt(2)) != -1 ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("ENZ") && "AEIOUY".indexOf(word.charAt(3)) != -1 ) { + this.append("EN"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EN") && "AÄEINOÖUÜY".indexOf(word.charAt(2)) != -1 ) { + this.append("EN"); + this.word = this.word.substring(2); + return true; + } + if( word.length() 
>= 2 && word.substring(0, 2).equals("EN") ) { + this.append("N"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("ERH") && "AÄEIOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("ER"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("ER") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ER") && "AÄEIOÖUÜY".indexOf(word.charAt(2)) != -1 ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ER") && word.length() == 2) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ER") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("ETI") && "AÄOÖÜU".indexOf(word.charAt(3)) != -1 ) { + this.append("EZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EUEU") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("EUILLE") && word.length() == 6) { + this.append("Ö"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EUR") && word.length() == 3) { + this.append("ÖR"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EUX") ) { + this.append("Ö"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EUYS") && word.length() == 4) { + this.append("EUZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && 
word.substring(0, 2).equals("EU") ) { + this.append("EU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("EYER") ) { + this.append("EIA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EY") ) { + this.append("EI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("E") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("FANS") && word.length() == 4) { + this.append("FE"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("FAN") && word.length() == 3) { + this.append("FE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("FAULT") ) { + this.append("FUL"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("FEE") && "DL".indexOf(word.charAt(3)) != -1 ) { + this.append("FI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("FEHLER") ) { + this.append("FELA"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("FE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("FE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("FOND") ) { + this.append("FUN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("FRAIN") && word.length() == 5) { + this.append("FRA"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 7 && word.substring(0, 6).equals("FRISEU") && 
"RS".indexOf(word.charAt(6)) != -1 ) { + this.append("FRIZÖ"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("F") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("G'S") && word.length() == 3) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("GAGS") && word.length() == 4) { + this.append("KEX"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("GAG") && word.length() == 3) { + this.append("KEK"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("GD") ) { + this.append("KT"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("GEGEN") ) { + this.append("KEKN"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("GE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("KE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("GETTE") && word.length() == 5) { + this.append("KET"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("G") && "CK".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("GG") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("GI") && "AO".indexOf(word.charAt(2)) != -1 ) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + 
if( word.length() >= 4 && word.substring(0, 4).equals("GION") && word.length() == 4) { + this.append("KIUN"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("GIUS") ) { + this.append("IU"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("GMBH") && word.length() == 4) { + this.append("GMPH"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("GNAC") && word.length() == 4) { + this.append("NIAK"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("GNON") && word.length() == 4) { + this.append("NIUN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("GN") && word.length() == 2) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("GONCAL") ) { + this.append("KUNZA"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("GS") && "CH".indexOf(word.charAt(2)) != -1 ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("GST") ) { + this.append("XT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("G") && "SßXZ".indexOf(word.charAt(1)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("GUCK") ) { + this.append("KU"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("GUI") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + 
} + if( word.length() >= 1 && word.substring(0, 1).equals("G") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("HEAD") ) { + this.append("E"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("HE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("HE") && "LMN".indexOf(word.charAt(2)) != -1 ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("HEUR") && word.length() == 4) { + this.append("ÖR"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("H") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("IEC") && word.length() == 3) { + this.append("IZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("IEI") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("IELL") ) { + this.append("IEL"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("IENNE") && word.length() == 5) { + this.append("IN"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("IERRE") && word.length() == 5) { + this.append("IER"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("IETTE") && word.length() == 5) { + this.append("IT"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && 
word.substring(0, 3).equals("IEU") ) { + this.append("IÖ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("IE") ) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("IGHT") && word.length() == 4) { + this.append("EIT"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("IGNI") && "EO".indexOf(word.charAt(4)) != -1 ) { + this.append("INI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("IGN") && "AEOU".indexOf(word.charAt(3)) != -1 && word.length() == 4) { + this.append("INI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("IJ") && "AOU".indexOf(word.charAt(2)) != -1 ) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("IJ") && word.length() == 2) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("IJ") ) { + this.append("EI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("IKOLE") && word.length() == 5) { + this.append("IKUL"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 6 && word.substring(0, 5).equals("ILLAN") && "STZ".indexOf(word.charAt(5)) != -1 ) { + this.append("ILIA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 6 && word.substring(0, 5).equals("ILLAR") && "DT".indexOf(word.charAt(5)) != -1 ) { + this.append("ILIA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("INVER") ) { + this.append("INFE"); + this.word = this.word.substring(4); + return true; 
+ } + if( word.length() >= 4 && word.substring(0, 3).equals("ITI") && "AÄOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("IZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("IVIER") && word.length() == 5) { + this.append("IFIE"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("I") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("JAVIE") ) { + this.append("ZA"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("JEAN") && word.length() == 4) { + this.append("IA"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("JEAN") ) { + this.append("IA"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("JER") ) { + this.append("IE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("JE") && "LMNST".indexOf(word.charAt(2)) != -1 ) { + this.append("IE"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("JOR") && "GK".indexOf(word.charAt(3)) != -1 && word.length() == 4) { + this.append("IÖRK"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("J") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("KC") && "ÄEIJ".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("KE") && 
"LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("KE"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("KH") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("KIC") && word.length() == 3) { + this.append("KIZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("KLE") && "LMNRST".indexOf(word.charAt(3)) != -1 ) { + this.append("KLE"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("KOTELE") ) { + this.append("KUTL"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("KREAT") ) { + this.append("KREA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("KST") ) { + this.append("XT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("K") && "SßXZ".indexOf(word.charAt(1)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("KTI") && "AIOU".indexOf(word.charAt(3)) != -1 ) { + this.append("XI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("KT") && "SßXZ".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("K") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("LARVE") ) { + this.append("LARF"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && 
word.length() >= 5 && word.substring(0, 5).equals("LEAND") ) { + this.append("LEAN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("LEL") ) { + this.append("LE"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("LE") && "MNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("LE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("LETTE") && word.length() == 5) { + this.append("LET"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("LFGNAG") ) { + this.append("LFKAN"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("LIC") && word.length() == 3) { + this.append("LIZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("LIVE") && word.length() == 4) { + this.append("LEIF"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("LUI") && "GS".indexOf(word.charAt(3)) != -1 ) { + this.append("LU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("L") ) { + this.append("L"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 7 && word.substring(0, 6).equals("MASSEU") && "RS".indexOf(word.charAt(6)) != -1 ) { + this.append("NAZÖ"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 7 && word.substring(0, 7).equals("MAURICE") ) { + this.append("NURIZ"); + this.word = this.word.substring(7); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("MBH") && word.length() == 3) { + this.append("MPH"); + this.word = this.word.substring(3); + 
return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("MB") && "SßZ".indexOf(word.charAt(2)) != -1 ) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("MC") ) { + this.append("NK"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("MEMOIR") ) { + this.append("NENUA"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("ME") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("NE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("MIGUEL") ) { + this.append("NIKL"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("MIKE") && word.length() == 4) { + this.append("NEIK"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("MN") ) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("MPJUTE") ) { + this.append("NPUT"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("MP") && "SßZ".indexOf(word.charAt(2)) != -1 ) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("MP") && "BDJLMNPQRTVW".indexOf(word.charAt(2)) != -1 ) { + this.append("NP"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("M") ) { + this.append("N"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("NACH") ) { + this.append("NAK"); + this.word = 
this.word.substring(4); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("NADINE") ) { + this.append("NATIN"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("NAIV") ) { + this.append("NA"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("NAISE") && word.length() == 5) { + this.append("NEZE"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("NCOISE") && word.length() == 6) { + this.append("ZUA"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("NCOIS") && word.length() == 5) { + this.append("ZUA"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("NEBEN") ) { + this.append("NEPN"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("NE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("NE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("NEN") ) { + this.append("NE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("NETTE") && word.length() == 5) { + this.append("NET"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("NG") && "BDFJLMNPQRTVW".indexOf(word.charAt(2)) != -1 ) { + this.append("NK"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("NICHTS") ) { + this.append("NIX"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("NICHT") ) { + this.append("NIKT"); 
+ this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("NINE") && word.length() == 4) { + this.append("NIN"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("NON") ) { + this.append("NUN"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("NOT") ) { + this.append("NUT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("NTI") && "AIOU".indexOf(word.charAt(3)) != -1 ) { + this.append("NZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("NTIEL") ) { + this.append("NZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("NYLON") ) { + this.append("NEILUN"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ND") && "SßZ".indexOf(word.charAt(2)) != -1 && word.length() == 3) { + this.append("NZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("NT") && "SßZ".indexOf(word.charAt(2)) != -1 && word.length() == 3) { + this.append("NZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ND'S") && word.length() == 4) { + this.append("NZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("NT'S") && word.length() == 4) { + this.append("NZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("NSTS") && word.length() == 4) { + this.append("NZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 1 && word.substring(0, 
1).equals("N") ) { + this.append("N"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("OBER") ) { + this.append("UPA"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OE") ) { + this.append("Ö"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("OGNIE") ) { + this.append("UNI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("OGN") && "AEOU".indexOf(word.charAt(3)) != -1 && word.length() == 4) { + this.append("UNI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OIE") && word.length() == 3) { + this.append("Ö"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OIR") && word.length() == 3) { + this.append("UAR"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OIX") ) { + this.append("UA"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OI") ) { + this.append("EU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("OJ") && "AÄEIOÖUÜ".indexOf(word.charAt(2)) != -1 ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("OKAY") && word.length() == 4) { + this.append("UKE"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("OLYN") && word.length() == 4) { + this.append("ULIN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("OTI") && 
"AÄOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("UZI"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("OUI") ) { + this.append("FI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("OUILLE") && word.length() == 6) { + this.append("ULIE"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("OU") && "DT".indexOf(word.charAt(2)) != -1 ) { + this.append("AU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("OUSE") && word.length() == 4) { + this.append("AUZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OUT") ) { + this.append("AU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OU") ) { + this.append("U"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OWS") && word.length() == 3) { + this.append("UZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("OY") && "AÄEIOÖUÜ".indexOf(word.charAt(2)) != -1 ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("O") && "JY".indexOf(word.charAt(1)) != -1 ) { + this.append("EU"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("O") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("PATIEN") ) { + this.append("PAZI"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 6 && 
word.substring(0, 6).equals("PENSIO") ) { + this.append("PANZI"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("PE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("PE"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("PFER") ) { + this.append("FE"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("P") && "FH".indexOf(word.charAt(1)) != -1 ) { + this.append("F"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("POLY") ) { + this.append("PULI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 8 && word.substring(0, 8).equals("PORTRAIT") ) { + this.append("PURTRE"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("PP") && "FH".indexOf(word.charAt(2)) != -1 ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PP") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("PRIX") && word.length() == 4) { + this.append("PRI"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 1).equals("P") && "SßZ".indexOf(word.charAt(1)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("PTI") && "AÄOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("PZI"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("PIC") && word.length() == 3) { + 
this.append("PIK"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("P") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("QUE") && "LMNRST".indexOf(word.charAt(3)) != -1 ) { + this.append("KFE"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("QUE") && word.length() == 3) { + this.append("K"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("QUI") && "NS".indexOf(word.charAt(3)) != -1 && word.length() == 4) { + this.append("KI"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("QU") ) { + this.append("KF"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Q") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("RCH") ) { + this.append("RK"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 8 && word.substring(0, 8).equals("RECHERCH") ) { + this.append("REZAZ"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("RER") && word.length() == 3) { + this.append("RA"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("RE") && "MNR".indexOf(word.charAt(2)) != -1 ) { + this.append("RE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("RETTE") && word.length() == 5) { + this.append("RET"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("RH") ) { + this.append("R"); + 
this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("RJA") && "MN".indexOf(word.charAt(3)) != -1 ) { + this.append("RI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("RTI") && "AÄOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("RZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("RY") && "KN".indexOf(word.charAt(2)) != -1 && word.length() == 3) { + this.append("RI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("R") ) { + this.append("R"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("SAFE") && word.length() == 4) { + this.append("ZEIF"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("SAUCE") ) { + this.append("ZUZ"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("SCHSCH") ) { + this.append(""); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 7 && word.substring(0, 7).equals("SCHTSCH") ) { + this.append("Z"); + this.word = this.word.substring(7); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("SC") && "HZ".indexOf(word.charAt(2)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SC") ) { + this.append("ZK"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 8 && word.substring(0, 8).equals("SELBSTST") ) { + this.append("ZELP"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("SELBST") ) { + 
this.append("ZELPZT"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 7 && word.substring(0, 7).equals("SERVICE") ) { + this.append("ZÖRFIZ"); + this.word = this.word.substring(7); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("SE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("ZE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("SETTE") && word.length() == 5) { + this.append("ZET"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("SHP") ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("SHST") ) { + this.append("ZT"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("SHTSH") ) { + this.append("Z"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("SHT") ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SH") ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("SIEGLI") ) { + this.append("ZIKL"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("SIGLI") ) { + this.append("ZIKL"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("SIGHT") ) { + this.append("ZEIT"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("SIGN") ) { + this.append("ZEIN"); + this.word = this.word.substring(4); + return true; + } + if( 
word.length() >= 4 && word.substring(0, 3).equals("SKI") && "NPZ".indexOf(word.charAt(3)) != -1 ) { + this.append("ZKI"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("SKI") ) { + this.append("ZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("SOUND") ) { + this.append("ZAUN"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("STAATS") ) { + this.append("ZTAZ"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("STADT") ) { + this.append("ZTAT"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("START") ) { + this.append("ZTART"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 8 && word.substring(0, 8).equals("STAURANT") ) { + this.append("ZTURAN"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("STEAK") ) { + this.append("ZTE"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("STRAF") ) { + this.append("ZTRAF"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ST'S") && word.length() == 4) { + this.append("Z"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("STST") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("STS") && "ACEHIOUÄÜÖ".indexOf(word.charAt(3)) != -1 ) { + this.append("ZT"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && 
word.substring(0, 2).equals("ST") && "SZ".indexOf(word.charAt(2)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 4).equals("STYN") && "AE".indexOf(word.charAt(4)) != -1 && word.length() == 5) { + this.append("ZTIN"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ST") ) { + this.append("ZT"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("SZE") && "NPT".indexOf(word.charAt(3)) != -1 ) { + this.append("ZE"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("SZI") && "ELN".indexOf(word.charAt(3)) != -1 ) { + this.append("ZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("SZCZ") ) { + this.append("Z"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("SZT") ) { + this.append("ZT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SZ") ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("T'S") && word.length() == 3) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("TCH") ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("TEAT") ) { + this.append("TEA"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 3 && 
word.substring(0, 2).equals("TE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("TE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TH") ) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("TIC") && word.length() == 3) { + this.append("TIZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("TOAS") ) { + this.append("TU"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("TOILET") ) { + this.append("TULE"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("TOIN") ) { + this.append("TUA"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("TRAINI") ) { + this.append("TREN"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("TSCH") ) { + this.append("Z"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("TSH") ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("TST") ) { + this.append("ZT"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("T") && "Sß".indexOf(word.charAt(1)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("TT") && "SZ".indexOf(word.charAt(2)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TT") ) { + this.append("T"); + this.word = this.word.substring(2); + 
return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TZ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("T") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("UEBER") ) { + this.append("IPA"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("UE") ) { + this.append("I"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("UIE") && word.length() == 3) { + this.append("I"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("UM") ) { + this.append("UN"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("UNTERE") ) { + this.append("UNTE"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("UNTER") ) { + this.append("UNTA"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("UNVER") ) { + this.append("UNFA"); + this.word = this.word.substring(5); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("UN") ) { + this.append("UN"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("UTI") && "AÄOÖUÜ".indexOf(word.charAt(3)) != -1 ) { + this.append("UZI"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("U") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("VACL") ) { + 
this.append("FAZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("VAC") && word.length() == 3) { + this.append("FAZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("VEDD") ) { + this.append("FE"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("VEREIN") ) { + this.append("FAEIN"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("VERSEN") ) { + this.append("FAZN"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("VER") ) { + this.append("FA"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("VER") ) { + this.append("FA"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("VET") && "HT".indexOf(word.charAt(3)) != -1 ) { + this.append("FET"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("VETTE") && word.length() == 5) { + this.append("FET"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("VIC") && word.length() == 3) { + this.append("FIZ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("VIEL") ) { + this.append("FIL"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("VIEW") ) { + this.append("FIU"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 3).equals("VOR") ) { + this.append("FUR"); + this.word = 
this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("VY") ) { + this.append("FI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("V") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("WE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("FE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("WIC") && word.length() == 3) { + this.append("FIZ"); + this.word = this.word.substring(3); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("WIEDER") ) { + this.append("FITA"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("WY") ) { + this.append("FI"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("W") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("XE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("XE"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("X") && "CSZ".indexOf(word.charAt(1)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("XTS") && "CH".indexOf(word.charAt(3)) != -1 ) { + this.append("XT"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("XT") && 
"SZ".indexOf(word.charAt(2)) != -1 ) { + this.append("Z"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("X"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("YE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("IE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("YE") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 3).equals("YOR") && "GK".indexOf(word.charAt(3)) != -1 && word.length() == 4) { + this.append("IÖRK"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Y") && "AOU".indexOf(word.charAt(1)) != -1 ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 4 && word.substring(0, 4).equals("YVES") && word.length() == 4) { + this.append("IF"); + this.word = this.word.substring(4); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("YVONNE") && word.length() == 6) { + this.append("IFUN"); + this.word = this.word.substring(6); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Y") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ZC") && "AOU".indexOf(word.charAt(2)) != -1 ) { + this.append("ZK"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("ZE") && "LMNRST".indexOf(word.charAt(2)) != -1 ) { + this.append("ZE"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ZH") ) { + this.append("Z"); + 
this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ZS") && "CHT".indexOf(word.charAt(2)) != -1 ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ZS") ) { + this.append("Z"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 6 && word.substring(0, 6).equals("ZUERST") ) { + this.append("ZUERZT"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("ZURÜCK") ) { + this.append("ZURIK"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("ZUVER") ) { + this.append("ZUFA"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Z") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test + */ + public static void main(String[] args) { + AphoneDe aphone = new AphoneDe(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneIs.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneIs.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneIs.java (revision 0) @@ -0,0 +1,140 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * String to phone conversion + * @Author Mathieu Lecarme + */ +public class AphoneIs extends Aphone{ + private StringBuffer phone; + private String word; + private boolean starting; + + /** + * @param a word + * @return phonetic transcription + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( word.length() >= 1 && word.substring(0, 1).equals("S") ) { + this.append("Z"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("N") && word.length() == 1) { + this.append("NN"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("NN") && word.length() == 2) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Y") ) { + this.append("I"); + 
this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("I") ) { + this.append("Y"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("L") ) { + this.append("LL"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("LL") ) { + this.append("L"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("FL") ) { + this.append("BL"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("LL") ) { + this.append("DL"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("RN") ) { + this.append("RDN"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("NGD") ) { + this.append("GND"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 5 && word.substring(0, 5).equals("TÖLVA") ) { + this.append("TALVA"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ý") ) { + this.append("Í"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Í") ) { + this.append("Ý"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test + */ + public static void main(String[] args) { + AphoneIs aphone = new AphoneIs(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/Aphone.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/Aphone.java (revision 0) +++ 
/**
 * Converts a word to its phonetic code, using the aspell notation.
 *
 * @author Mathieu Lecarme
 * @see <a href="http://en.wikipedia.org/wiki/Phoneme">Phoneme</a>
 * @see <a href="http://aspell.net/">GNU Aspell</a>
 */
public abstract class Aphone {
  /**
   * Converts a word to its phonetic transcription.
   *
   * @param word the word to convert
   * @return phonetic transcription
   */
  public abstract String toPhone(String word);

  /**
   * Converts a word given as a character array to its phonetic transcription.
   *
   * <p>A <code>null</code> array is forwarded to {@link #toPhone(String)} as a
   * <code>null</code> string instead of failing here, so both overloads treat
   * a missing word the same way.
   *
   * @param word the characters of the word to convert, may be <code>null</code>
   * @return phonetic transcription, as produced by {@link #toPhone(String)}
   */
  public String toPhone(char[] word) {
    if (word == null) {
      return toPhone((String) null);
    }
    return toPhone(new String(word));
  }
}
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * String to phone conversion + * @Author Mathieu Lecarme + */ +public class AphoneEl extends Aphone{ + private StringBuffer phone; + private String word; + private boolean starting; + + /** + * @param a word + * @return phonetic transcription + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( word.length() >= 2 && word.substring(0, 2).equals("ΒΒ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Β") ) { + this.append("Β"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΓΓ") ) { + this.append("ΓΚ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Γ") ) { + this.append("Γ"); + this.word = this.word.substring(1); + 
return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΔΔ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Δ") ) { + this.append("Δ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΖΖ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ζ") ) { + this.append("Ζ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΘΘ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Θ") ) { + this.append("Θ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΚΚ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΚΣ") ) { + this.append("Ξ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Κ") ) { + this.append("Κ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΛΛ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Λ") ) { + this.append("Λ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΜΜ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Μ") ) { + this.append("Μ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΝΝ") ) { + this.append(""); + this.word = this.word.substring(1); + return 
true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ν") ) { + this.append("Ν"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΠΠ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΠΣ") ) { + this.append("Ψ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Π") ) { + this.append("Π"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΡΡ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ρ") ) { + this.append("Ρ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΣΣ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Σ") ) { + this.append("Σ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΤΤ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Τ") ) { + this.append("Τ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΦΦ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Φ") ) { + this.append("Φ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΧΧ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Χ") ) { + this.append("Χ"); + this.word = this.word.substring(1); + return true; + } 
+ if( word.length() >= 2 && word.substring(0, 2).equals("ΑΎ") ) { + this.append("ΑΥ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΑΥΝΤ") ) { + this.append("ΑΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΑΥΓΚ") ) { + this.append("ΑΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΑΥΤΖ") ) { + this.append("ΑΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ΑΥ") && "ΓΔΖΛΜΝΡ".indexOf(word.charAt(2)) != -1 ) { + this.append("ΑΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΑΥΒ") ) { + this.append("ΑΒ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ΑΥ") && "ΑΕΗΙΟΩΥΆΈΉΊΌΏΎ".indexOf(word.charAt(2)) != -1 ) { + this.append("ΑΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΑΥΦ") ) { + this.append("ΑΦ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΑΥΤΣ") ) { + this.append("ΑΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ΑΥ") && "ΘΚΠΣΤΧ".indexOf(word.charAt(2)) != -1 ) { + this.append("ΑΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΑΥΞ") ) { + this.append("ΑΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΑΥΨ") ) { + this.append("ΑΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Α") && "ΙΊ".indexOf(word.charAt(1)) != -1 ) { + this.append("Ε"); + 
this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Α") ) { + this.append("Α"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΕΎ") ) { + this.append("Υ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΕΥΝΤ") ) { + this.append("ΕΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΕΥΓΚ") ) { + this.append("ΕΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΕΥΤΖ") ) { + this.append("ΕΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ΕΥ") && "ΓΔΖΛΜΝΡ".indexOf(word.charAt(2)) != -1 ) { + this.append("ΕΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΕΥΒ") ) { + this.append("ΕΒ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ΕΥ") && "ΑΕΗΙΟΩΥΆΈΉΊΌΏΎ".indexOf(word.charAt(2)) != -1 ) { + this.append("ΕΒ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΕΥΦ") ) { + this.append("ΕΦ"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("ΕΥΤΣ") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ΕΥ") && "ΘΚΠΣΤΧ".indexOf(word.charAt(2)) != -1 ) { + this.append("ΕΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΕΥΞ") ) { + this.append("ΕΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ΕΥΨ") ) { + 
this.append("ΕΦ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ε") && "ΙΊ".indexOf(word.charAt(1)) != -1 ) { + this.append("Ι"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ε") ) { + this.append("Ε"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ο") && "ΙΊ".indexOf(word.charAt(1)) != -1 ) { + this.append("Ι"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Ο") && "ΥΎ".indexOf(word.charAt(1)) != -1 ) { + this.append("ΟΥ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ο") ) { + this.append("Ο"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ω") ) { + this.append("Ο"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Η") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΥΙ") ) { + this.append("Ι"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Υ") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ι") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΞΞ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΞΣ") ) { + this.append("Ξ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ξ") ) { + this.append("Ξ"); + this.word = 
this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΨΨ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ΨΣ") ) { + this.append("Ψ"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ψ") ) { + this.append("Ψ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ϊ") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ϋ") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ά") ) { + this.append("Α"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ό") ) { + this.append("Ο"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ί") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Έ") ) { + this.append("Ε"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ύ") ) { + this.append("Υ"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ώ") ) { + this.append("Ο"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ή") ) { + this.append("Ι"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test + */ + public static void main(String[] args) { + AphoneEl aphone = new AphoneEl(); + for(int i = 0; i < args.length; i++) { + 
System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/Homophone.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/Homophone.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/Homophone.java (revision 0) @@ -0,0 +1,104 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.Map.Entry; + +import org.apache.lucene.search.spell.Dictionary; +import org.apache.lucene.search.spell.PlainTextDictionary; +
/**
 * Homophone classification of a list of words: every word is filed under the
 * phonetic key that the configured {@link Aphone} computes for it, so words
 * sharing a key end up in the same group.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Homophone">Homophone (Wikipedia)</a>
 * @author Mathieu Lecarme
 */
public class Homophone {
  /** Phonetic encoder used to compute the grouping key of each word. */
  private Aphone aphone;

  /** Phonetic key (String) to the Set of words (String) sharing it, sorted by key. */
  private Map dict = new TreeMap();

  /**
   * Replaces the phonetic encoder used by subsequent {@link #addWord(String)} calls.
   * Words that were already added keep the key they were filed under.
   *
   * @param a the encoder to use
   */
  public void setAphone(Aphone a) {
    aphone = a;
  }

  /**
   * Initialisation with a specific language.
   *
   * @param aphone the phonetic encoder for that language
   */
  public Homophone(Aphone aphone) {
    setAphone(aphone);
  }

  /**
   * Adds a word to the group of its phonetic key.
   *
   * @param word the word to classify
   */
  public void addWord(String word) {
    String phonem = aphone.toPhone(word);
    // NOTE(review): this trace is printed for every word; it looks like a
    // debugging leftover - confirm nobody relies on it before removing.
    System.out.println(word + " -> " + phonem);
    if (phonem == null) {
      // The visible encoders return null for a null word, and a naturally
      // ordered TreeMap cannot hold a null key, so such words are skipped.
      return;
    }
    Set group = (Set) dict.get(phonem);
    if (group == null) {
      group = new HashSet();
      dict.put(phonem, group);
    }
    group.add(word);
  }

  /**
   * Adds every word of a dictionary.
   *
   * @param dico the dictionary whose words are classified
   */
  public void read(Dictionary dico) {
    Iterator iter = dico.getWordsIterator();
    while (iter.hasNext()) {
      addWord((String) iter.next());
    }
  }

  /**
   * Prints every phonetic key on its own line, followed by the set of words
   * filed under it, to standard output.
   */
  public void dump() {
    Iterator iter = dict.entrySet().iterator();
    while (iter.hasNext()) {
      Entry line = (Entry) iter.next();
      System.out.println(line.getKey());
      System.out.print(line.getValue());
      System.out.print("\n");
    }
  }

  /**
   * Simple test: classifies the words of a plain-text dictionary.
   *
   * @param args <code>args[0]</code> is the language code ("fr" or "en"),
   *             <code>args[1]</code> the path of the dictionary file
   */
  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: Homophone <fr|en> <dictionary file>");
      return;
    }
    Map languages = new HashMap();
    languages.put("fr", new AphoneFr());
    languages.put("en", new AphoneEn());
    Aphone aphone = (Aphone) languages.get(args[0]);
    if (aphone == null) {
      // Fail here with a clear message instead of a NullPointerException
      // on the first addWord() call.
      System.err.println("Unsupported language: " + args[0]);
      return;
    }
    Homophone homophone = new Homophone(aphone);
    Dictionary d;
    try {
      d = new PlainTextDictionary(new File(args[1]));
    } catch (FileNotFoundException e) {
      // Previously swallowed, which made the program exit without any output.
      System.err.println("Dictionary file not found: " + args[1]);
      return;
    }
    homophone.read(d);
    homophone.dump();
  }
}
Index:
contrib/aphone/src/java/org/apache/lucene/aphone/AphoneEn.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneEn.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneEn.java (revision 0) @@ -0,0 +1,595 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * String to phone conversion + * @Author Mathieu Lecarme + */ +public class AphoneEn extends Aphone{ + private StringBuffer phone; + private String word; + private boolean starting; + + /** + * @param a word + * @return phonetic transcription + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("AH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*H"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("AR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 1).equals("A") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append("*"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("A") ) { + this.append("*"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("H"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( 
word.length() >= 2 && word.substring(0, 1).equals("A") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("BB") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("B") ) { + this.append("B"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CQ") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("CIA") ) { + this.append("X"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CH") ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("C") && "EIY".indexOf(word.charAt(1)) != -1 ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CK") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("COUGH") ) { + this.append("KF"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CC") ) { + this.append("C"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("C") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("DG") && "EIY".indexOf(word.charAt(2)) != -1 ) { + this.append("K"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("DD") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( 
word.length() >= 1 && word.substring(0, 1).equals("D") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("É") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("EH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*H"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("ER") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 1).equals("E") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append("*"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 6 && word.substring(0, 6).equals("ENOUGH") && word.length() == 6) { + this.append("*NF"); + this.word = this.word.substring(6); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("E") ) { + this.append("*"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("H"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("ER") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("E") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("FF") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("F") ) { + 
this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("GN") ) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("GN") && word.length() == 2) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("GNS") && word.length() == 3) { + this.append("NS"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 4 && word.substring(0, 4).equals("GNED") && word.length() == 4) { + this.append("N"); + this.word = this.word.substring(4); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("GH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("GH") ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("GG") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("G") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("H") ) { + this.append("H"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("IH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*H"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("IR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 1).equals("I") && 
"HR".indexOf(word.charAt(1)) != -1 ) { + this.append("*"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("I") ) { + this.append("*"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("ING") ) { + this.append("N"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("IH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("H"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("IR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("I") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("J") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("KN") ) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("KK") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("K") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("LAUGH") ) { + this.append("LF"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("LL") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("L") ) { + this.append("L"); + this.word = this.word.substring(1); + return true; + 
} + if( word.length() >= 2 && word.substring(0, 2).equals("MB") && word.length() == 2) { + this.append("M"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("MM") ) { + this.append("M"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("M") ) { + this.append("M"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("NN") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("N") ) { + this.append("N"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("OH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*H"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("OR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 1).equals("O") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append("*"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("O") ) { + this.append("*"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("OH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("H"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("OR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("O") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + 
this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PH") ) { + this.append("F"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("PN") ) { + this.append("N"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PP") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("P") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Q") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("RH") ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("ROUGH") ) { + this.append("RF"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("RR") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("R") ) { + this.append("R"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 4 && word.substring(0, 3).equals("SCH") && "EOU".indexOf(word.charAt(3)) != -1 ) { + this.append("SK"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("SC") && "IEY".indexOf(word.charAt(2)) != -1 ) { + this.append("S"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SH") ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("SI") && "AO".indexOf(word.charAt(2)) != 
-1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SS") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("TI") && "AO".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TH") ) { + this.append("@"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("TCH") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 5 && word.substring(0, 5).equals("TOUGH") ) { + this.append("TF"); + this.word = this.word.substring(5); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TT") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("T") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("UH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*H"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 3 && word.substring(0, 2).equals("UR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("*R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 1).equals("U") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append("*"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("U") ) { + this.append("*"); + this.word 
= this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("UH") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("H"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("UR") && "AEIOUY".indexOf(word.charAt(2)) != -1 ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("U") && "HR".indexOf(word.charAt(1)) != -1 ) { + this.append(""); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("V") ) { + this.append("W"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("V") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("WR") ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( this.starting && word.length() >= 2 && word.substring(0, 2).equals("WH") ) { + this.append("W"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("W") && "AEIOU".indexOf(word.charAt(1)) != -1 ) { + this.append("W"); + this.word = this.word.substring(1); + return true; + } + if( this.starting && word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("KS"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Y") && "AEIOU".indexOf(word.charAt(1)) != -1 ) { + this.append("Y"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ZZ") ) { + this.append(""); + this.word = 
this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Z") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test + */ + public static void main(String[] args) { + AphoneEn aphone = new AphoneEn(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/src/java/org/apache/lucene/aphone/AphoneBr.java =================================================================== --- contrib/aphone/src/java/org/apache/lucene/aphone/AphoneBr.java (revision 0) +++ contrib/aphone/src/java/org/apache/lucene/aphone/AphoneBr.java (revision 0) @@ -0,0 +1,465 @@ +package org.apache.lucene.aphone; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * String to phone conversion + * @Author Mathieu Lecarme + */ +public class AphoneBr extends Aphone{ + private StringBuffer phone; + private String word; + private boolean starting; + + /** + * @param a word + * @return phonetic transcription + */ + public String toPhone(String word) { + if(word == null) + return null; + this.phone = new StringBuffer(); + this.word = word.toUpperCase(); + this.starting = true; + while( this.word.length() > 0 ) { + this.eat(); + this.starting = false; + } + return this.phone.toString(); + } + + private void append(String letter) { + if (letter.length() > 0) { + if (this.phone.length() > 0 && letter.charAt(0) == this.phone.charAt(this.phone.length() - 1)) { + if (letter.length() > 1) + this.phone.append(letter.substring(1)); + } else + this.phone.append(letter); + } + } + + private boolean eat() { + if( word.length() >= 2 && word.substring(0, 2).equals("AI") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AN") && "AEUIO".indexOf(word.charAt(2)) != -1 ) { + this.append("AM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AN") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("AMM") ) { + this.append("AM"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("AM") && "AEUIO".indexOf(word.charAt(2)) != -1 ) { + this.append("AM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AM") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("AU") ) { + this.append("O"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 
1).equals("A") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Â") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("À") ) { + this.append("A"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("BB") ) { + this.append("P"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("B") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ç") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("C") && "EI".indexOf(word.charAt(1)) != -1 ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CU") && "EI".indexOf(word.charAt(2)) != -1 ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("CC") && "EI".indexOf(word.charAt(2)) != -1 ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CC") ) { + this.append("K"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("CH") ) { + this.append("CH"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("C") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("DD") ) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("D") ) { + 
this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 8 && word.substring(0, 8).equals("EMMENTAL") ) { + this.append("EMATAL"); + this.word = this.word.substring(8); + return true; + } + if( word.length() >= 9 && word.substring(0, 9).equals("EMMENTHAL") ) { + this.append("EMATAL"); + this.word = this.word.substring(9); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EM") && "AEIOU".indexOf(word.charAt(2)) != -1 ) { + this.append("EM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EM") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ET") && word.length() == 2) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EU") ) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("EN") && "AEUIO".indexOf(word.charAt(2)) != -1 ) { + this.append("EM"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EN") ) { + this.append("A"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ER") && word.length() == 2) { + this.append("E"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("EO") ) { + this.append("O"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("EAU") ) { + this.append("O"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("E") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 
1).equals("È") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("É") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ê") ) { + this.append("E"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("F") ) { + this.append("F"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("G") && "EIY".indexOf(word.charAt(1)) != -1 ) { + this.append("J"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 2).equals("GU") && "EIY".indexOf(word.charAt(2)) != -1 ) { + this.append("G"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("G") ) { + this.append("G"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("H") ) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("I") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Î") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("J") ) { + this.append("J"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("KS") ) { + this.append("X"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("K") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("LL") ) { + this.append("L"); + this.word = 
this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("L") ) { + this.append("L"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("MM") ) { + this.append("M"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("M") ) { + this.append("M"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("NN") ) { + this.append("M"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("N") ) { + this.append("M"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 3 && word.substring(0, 3).equals("OEU") ) { + this.append("E"); + this.word = this.word.substring(3); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OU") ) { + this.append("U"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("OÙ") ) { + this.append("U"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("O") ) { + this.append("O"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ô") ) { + this.append("O"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PP") ) { + this.append("P"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("PH") ) { + this.append("F"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("P") ) { + this.append("P"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("QU") ) { + this.append("K"); + this.word = 
this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Q") ) { + this.append("K"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("RR") ) { + this.append("R"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("R") ) { + this.append("R"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") && word.length() == 1) { + this.append(""); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("SS") ) { + this.append("S"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("S") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("TT") ) { + this.append("T"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("T") ) { + this.append("T"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("U") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Ù") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Û") ) { + this.append("U"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("V") ) { + this.append("V"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("W") ) { + this.append("W"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("X") ) { + this.append("X"); + 
this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 1).equals("Y") && "AEOU".indexOf(word.charAt(1)) != -1 ) { + this.append("IL"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Y") ) { + this.append("I"); + this.word = this.word.substring(1); + return true; + } + if( word.length() >= 2 && word.substring(0, 2).equals("ZZ") ) { + this.append("S"); + this.word = this.word.substring(2); + return true; + } + if( word.length() >= 1 && word.substring(0, 1).equals("Z") ) { + this.append("S"); + this.word = this.word.substring(1); + return true; + } + + this.word = this.word.substring(1); + return false; + } + /** + * Simple test: creates an AphoneBr and prints the result of toPhone for each command-line argument, one per line. + */ + public static void main(String[] args) { + AphoneBr aphone = new AphoneBr(); + for(int i = 0; i < args.length; i++) { + System.out.println(aphone.toPhone(args[i])); + } + } +} Index: contrib/aphone/pom.xml =================================================================== Index: contrib/aphone/build.xml =================================================================== --- contrib/aphone/build.xml (revision 0) +++ contrib/aphone/build.xml (revision 0) @@ -0,0 +1,68 @@ + + + + + + + + Aphone + + + + + + + + + + + + + + + + XML Parser building dependency ${spellchecker.jar} + + + + + + + Must specify 'list' property. + + + + + + + + + + + + + + + + + +