Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 0) @@ -0,0 +1,117 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; + +/** + * Analyzer for Persian. + * + * Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around ZWNJ in addition to space. + * Some persian-specific variant forms (such as farsi yeh and keheh) are standardized. 
+ * "Stemming" is accomplished via stopwords. + * + */ +public final class PersianAnalyzer extends Analyzer { + + /** + * File containing default Persian stopwords. + * + * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html + * The stopword list is BSD-Licensed. + * + */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stoptable = new HashSet(); + /** + * The comment character in the stopwords file. All lines prefixed with this will be ignored + */ + public static final String STOPWORDS_COMMENT = "#"; + + /** + * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. + */ + public PersianAnalyzer() { + try { + InputStream stream = PersianAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE); + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT); + reader.close(); + stream.close(); + } catch (IOException e) { + // TODO: throw IOException + throw new RuntimeException(e); + } + } + + /** + * Builds an analyzer with the given stop words. + */ + public PersianAnalyzer( String[] stopwords ) { + stoptable = StopFilter.makeStopSet( stopwords ); + } + + /** + * Builds an analyzer with the given stop words. + */ + public PersianAnalyzer( Hashtable stopwords ) { + stoptable = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT} + */ + public PersianAnalyzer( File stopwords ) throws IOException { + stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT); + } + + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. 
+ * + * @return A TokenStream build from a ArabicLetterTokenizer filtered with + * ArabicNormalizationFilter, PersianNormalizationFilter and Persian Stop words + */ + public final TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new ArabicLetterTokenizer( reader ); + result = new ArabicNormalizationFilter( result ); + result = new PersianNormalizationFilter( result ); // additional persian-specific normalization + result = new StopFilter( result, stoptable ); + + return result; + } +} + Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java (revision 0) @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * A TokenFilter that applies {@link PersianNormalizer} to normalize the orthography. + * + */ + +public class PersianNormalizationFilter extends TokenFilter { + + protected PersianNormalizer normalizer = null; + + public PersianNormalizationFilter(TokenStream input) { + super(input); + normalizer = new PersianNormalizer(); + } + + + + public Token next(Token reusableToken) throws IOException { + if ((reusableToken = input.next(reusableToken)) == null) { + return null; + } else { + int oldlen = reusableToken.termLength(); + int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen); + if (oldlen != newlen) + reusableToken.setTermLength(newlen); + return reusableToken; + } + } +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java (revision 0) @@ -0,0 +1,91 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/*
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Normalizer for Persian.
 * <p>
 * Normalization is done in-place for efficiency, operating on a termbuffer.
 * <p>
 * Normalization is defined as:
 * <ul>
 *  <li>Normalization of farsi yeh and yeh barree to arabic yeh.
 *  <li>Normalization of persian keheh to arabic kaf.
 *  <li>Normalization of heh+yeh and heh goal to heh.
 *  <li>Removal of hamza above (necessary for heh + hamza).
 * </ul>
 */
public class PersianNormalizer {
  public static final char YEH = '\u064A';
  public static final char FARSI_YEH = '\u06CC';
  public static final char YEH_BARREE = '\u06D2';

  public static final char KEHEH = '\u06A9';
  public static final char KAF = '\u0643';

  public static final char HAMZA_ABOVE = '\u0654';
  public static final char HEH_YEH = '\u06C0';
  public static final char HEH_GOAL = '\u06C1';
  public static final char HEH = '\u0647';

  /**
   * Normalize an input buffer of Persian text
   *
   * @param s input buffer
   * @param len length of input buffer
   * @return length of input buffer after normalization
   */
  public int normalize(char s[], int len) {

    for (int i = 0; i < len; i++) {
      // the branches are mutually exclusive: no substituted character can
      // match a later condition, so else-if is equivalent and skips
      // redundant comparisons
      if (s[i] == FARSI_YEH || s[i] == YEH_BARREE) {
        s[i] = YEH;
      } else if (s[i] == KEHEH) {
        s[i] = KAF;
      } else if (s[i] == HEH_YEH || s[i] == HEH_GOAL) {
        s[i] = HEH;
      } else if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
        len = delete(s, i, len);
        i--; // re-examine the character shifted into this position
      }
    }

    return len;
  }

  /**
   * Delete a character in-place
   *
   * @param s Input Buffer
   * @param pos Position of character to delete
   * @param len length of input buffer
   * @return length of input buffer after deletion
   */
  protected int delete(char s[], int pos, int len) {
    if (pos < len)
      System.arraycopy(s, pos + 1, s, pos, len - pos - 1);

    return len - 1;
  }

}

// Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/package.html
// ===================================================================
// --- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/package.html (revision 0)
// +++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/package.html (revision 0)
// @@ -0,0 +1,5 @@
// <html><body>
// Analyzer for Persian.
+ + Index: contrib/analyzers/src/resources/org/apache/lucene/analysis/fa/stopwords.txt =================================================================== --- contrib/analyzers/src/resources/org/apache/lucene/analysis/fa/stopwords.txt (revision 0) +++ contrib/analyzers/src/resources/org/apache/lucene/analysis/fa/stopwords.txt (revision 0) @@ -0,0 +1,311 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +انان +نداشته +سراسر +خياه +ايشان +وي +تاكنون +بيشتري +دوم +پس +ناشي +وگو +يا +داشتند +سپس +هنگام +هرگز +پنج +نشان +امسال +ديگر +گروهي +شدند +چطور +ده +و +دو +نخستين +ولي +چرا +چه +وسط +ه +كدام +قابل +يك +رفت +هفت +همچنين +در +هزار +بله +بلي +شايد +اما +شناسي +گرفته +دهد +داشته +دانست +داشتن +خواهيم +ميليارد +وقتيكه +امد +خواهد +جز +اورده +شده +بلكه +خدمات +شدن +برخي +نبود +بسياري +جلوگيري +حق +كردند +نوعي +بعري +نكرده +نظير +نبايد +بوده +بودن +داد +اورد +هست +جايي +شود +دنبال +داده +بايد +سابق +هيچ +همان +انجا +كمتر +كجاست +گردد +كسي +تر +مردم +تان +دادن +بودند +سري +جدا +ندارند +مگر +يكديگر +دارد +دهند +بنابراين +هنگامي +سمت +جا +انچه +خود +دادند +زياد +دارند +اثر +بدون +بهترين +بيشتر +البته +به +براساس +بيرون +كرد +بعضي +گرفت +توي +اي +ميليون +او +جريان +تول +بر +مانند +برابر +باشيم +مدتي +گويند +اكنون +تا +تنها +جديد +چند +بي +نشده +كردن +كردم +گويد +كرده +كنيم +نمي +نزد +روي +قصد +فقط +بالاي +ديگران +اين +ديروز +توسط +سوم +ايم +دانند +سوي +استفاده +شما +كنار +داريم +ساخته +طور +امده +رفته +نخست +بيست +نزديك +طي +كنيد +از +انها +تمامي +داشت +يكي +طريق +اش +چيست +روب +نمايد +گفت +چندين +چيزي +تواند +ام +ايا +با +ان +ايد +ترين +اينكه +ديگري +راه +هايي +بروز +همچنان +پاعين +كس +حدود +مختلف +مقابل +چيز +گيرد +ندارد +ضد +همچون +سازي +شان +مورد +باره +مرسي +خويش +برخوردار +چون +خارج +شش +هنوز +تحت +ضمن +هستيم +گفته +فكر +بسيار +پيش +براي +روزهاي +انكه +نخواهد +بالا +كل +وقتي +كي +چنين +كه +گيري +نيست +است 
+كجا +كند +نيز +يابد +بندي +حتي +توانند +عقب +خواست +كنند +بين +تمام +همه +ما +باشند +مثل +شد +اري +باشد +اره +طبق +بعد +اگر +صورت +غير +جاي +بيش +ريزي +اند +زيرا +چگونه +بار +لطفا +مي +درباره +من +ديده +همين +گذاري +برداري +علت +گذاشته +هم +فوق +نه +ها +شوند +اباد +همواره +هر +اول +خواهند +چهار +نام +امروز +مان +هاي +قبل +كنم +سعي +تازه +را +هستند +زير +جلوي +عنوان +بود Index: contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (revision 0) @@ -0,0 +1,210 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +/** + * Test the Persian Analyzer + * + */ +public class TestPersianAnalyzer extends TestCase { + + /** This test fails with NPE when the + * stopwords file is missing in classpath */ + public void testResourcesAvailable() { + new PersianAnalyzer(); + } + + /** + * This test shows how the combination of tokenization (breaking on zero-width non-joiner), + * normalization (such as treating arabic YEH and farsi YEH the same), + * and stopwords creates a light-stemming effect for verbs. + * + * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar + */ + public void testBehaviorVerbs() throws Exception { + Analyzer a = new PersianAnalyzer(); + // active present indicative + assertAnalyzesTo(a, "می‌خورد", new String[] { "خورد" }); + // active preterite indicative + assertAnalyzesTo(a, "خورد", new String[] { "خورد" }); + // active imperfective preterite indicative + assertAnalyzesTo(a, "می‌خورد", new String[] { "خورد" }); + // active future indicative + assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد" }); + // active present progressive indicative + assertAnalyzesTo(a, "دارد می‌خورد", new String[] { "خورد" }); + // active preterite progressive indicative + assertAnalyzesTo(a, "داشت می‌خورد", new String[] { "خورد" }); + + // active perfect indicative + assertAnalyzesTo(a, "خورده‌است", new String[] { "خورده" }); + // active imperfective perfect indicative + assertAnalyzesTo(a, "می‌خورده‌است", new String[] { "خورده" }); + // active pluperfect indicative + assertAnalyzesTo(a, "خورده بود", new String[] { "خورده" }); + // active imperfective pluperfect indicative + assertAnalyzesTo(a, "می‌خورده بود", new String[] { "خورده" }); + // active preterite subjunctive + assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده" }); + // active 
imperfective preterite subjunctive + assertAnalyzesTo(a, "می‌خورده باشد", new String[] { "خورده" }); + // active pluperfect subjunctive + assertAnalyzesTo(a, "خورده بوده باشد", new String[] { "خورده" }); + // active imperfective pluperfect subjunctive + assertAnalyzesTo(a, "می‌خورده بوده باشد", new String[] { "خورده" }); + // passive present indicative + assertAnalyzesTo(a, "خورده می‌شود", new String[] { "خورده" }); + // passive preterite indicative + assertAnalyzesTo(a, "خورده شد", new String[] { "خورده" }); + // passive imperfective preterite indicative + assertAnalyzesTo(a, "خورده می‌شد", new String[] { "خورده" }); + // passive perfect indicative + assertAnalyzesTo(a, "خورده شده‌است", new String[] { "خورده" }); + // passive imperfective perfect indicative + assertAnalyzesTo(a, "خورده می‌شده‌است", new String[] { "خورده" }); + // passive pluperfect indicative + assertAnalyzesTo(a, "خورده شده بود", new String[] { "خورده" }); + // passive imperfective pluperfect indicative + assertAnalyzesTo(a, "خورده می‌شده بود", new String[] { "خورده" }); + // passive future indicative + assertAnalyzesTo(a, "خورده خواهد شد", new String[] { "خورده" }); + // passive present progressive indicative + assertAnalyzesTo(a, "دارد خورده می‌شود", new String[] { "خورده" }); + // passive preterite progressive indicative + assertAnalyzesTo(a, "داشت خورده می‌شد", new String[] { "خورده" }); + // passive present subjunctive + assertAnalyzesTo(a, "خورده شود", new String[] { "خورده" }); + // passive preterite subjunctive + assertAnalyzesTo(a, "خورده شده باشد", new String[] { "خورده" }); + // passive imperfective preterite subjunctive + assertAnalyzesTo(a, "خورده می‌شده باشد", new String[] { "خورده" }); + // passive pluperfect subjunctive + assertAnalyzesTo(a, "خورده شده بوده باشد", new String[] { "خورده" }); + // passive imperfective pluperfect subjunctive + assertAnalyzesTo(a, "خورده می‌شده بوده باشد", new String[] { "خورده" }); + + // active present subjunctive + assertAnalyzesTo(a, "بخورد", new 
String[] { "بخورد" }); + } + + /** + * This test shows how the combination of tokenization and stopwords creates a light-stemming effect for verbs. + * + * In this case, these forms are presented with alternative orthography, using arabic yeh and whitespace. + * This yeh phenomenon is common for legacy text due to some previous bugs in Microsoft Windows. + * + * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar + */ + public void testBehaviorVerbsDefective() throws Exception { + Analyzer a = new PersianAnalyzer(); + // active present indicative + assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" }); + // active preterite indicative + assertAnalyzesTo(a, "خورد", new String[] { "خورد" }); + // active imperfective preterite indicative + assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" }); + // active future indicative + assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد" }); + // active present progressive indicative + assertAnalyzesTo(a, "دارد مي خورد", new String[] { "خورد" }); + // active preterite progressive indicative + assertAnalyzesTo(a, "داشت مي خورد", new String[] { "خورد" }); + + // active perfect indicative + assertAnalyzesTo(a, "خورده است", new String[] { "خورده" }); + // active imperfective perfect indicative + assertAnalyzesTo(a, "مي خورده است", new String[] { "خورده" }); + // active pluperfect indicative + assertAnalyzesTo(a, "خورده بود", new String[] { "خورده" }); + // active imperfective pluperfect indicative + assertAnalyzesTo(a, "مي خورده بود", new String[] { "خورده" }); + // active preterite subjunctive + assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده" }); + // active imperfective preterite subjunctive + assertAnalyzesTo(a, "مي خورده باشد", new String[] { "خورده" }); + // active pluperfect subjunctive + assertAnalyzesTo(a, "خورده بوده باشد", new String[] { "خورده" }); + // active imperfective pluperfect subjunctive + assertAnalyzesTo(a, "مي خورده بوده باشد", new String[] { "خورده" }); + // passive present 
indicative + assertAnalyzesTo(a, "خورده مي شود", new String[] { "خورده" }); + // passive preterite indicative + assertAnalyzesTo(a, "خورده شد", new String[] { "خورده" }); + // passive imperfective preterite indicative + assertAnalyzesTo(a, "خورده مي شد", new String[] { "خورده" }); + // passive perfect indicative + assertAnalyzesTo(a, "خورده شده است", new String[] { "خورده" }); + // passive imperfective perfect indicative + assertAnalyzesTo(a, "خورده مي شده است", new String[] { "خورده" }); + // passive pluperfect indicative + assertAnalyzesTo(a, "خورده شده بود", new String[] { "خورده" }); + // passive imperfective pluperfect indicative + assertAnalyzesTo(a, "خورده مي شده بود", new String[] { "خورده" }); + // passive future indicative + assertAnalyzesTo(a, "خورده خواهد شد", new String[] { "خورده" }); + // passive present progressive indicative + assertAnalyzesTo(a, "دارد خورده مي شود", new String[] { "خورده" }); + // passive preterite progressive indicative + assertAnalyzesTo(a, "داشت خورده مي شد", new String[] { "خورده" }); + // passive present subjunctive + assertAnalyzesTo(a, "خورده شود", new String[] { "خورده" }); + // passive preterite subjunctive + assertAnalyzesTo(a, "خورده شده باشد", new String[] { "خورده" }); + // passive imperfective preterite subjunctive + assertAnalyzesTo(a, "خورده مي شده باشد", new String[] { "خورده" }); + // passive pluperfect subjunctive + assertAnalyzesTo(a, "خورده شده بوده باشد", new String[] { "خورده" }); + // passive imperfective pluperfect subjunctive + assertAnalyzesTo(a, "خورده مي شده بوده باشد", new String[] { "خورده" }); + + // active present subjunctive + assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" }); + } + + /** + * This test shows how the combination of tokenization (breaking on zero-width non-joiner or space) + * and stopwords creates a light-stemming effect for nouns, removing the plural -ha. 
+ */ + public void testBehaviorNouns() throws Exception { + Analyzer a = new PersianAnalyzer(); + assertAnalyzesTo(a, "برگ ها", new String[] { "برگ" }); + assertAnalyzesTo(a, "برگ‌ها", new String[] { "برگ" }); + } + + + public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception { + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); + for (int i = 0; i < output.length; i++) { + Token nextToken = ts.next(reusableToken); + assertNotNull(nextToken); + assertEquals(nextToken.term(), output[i]); + } + assertNull(ts.next(reusableToken)); + ts.close(); + } + +} Index: contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (revision 0) @@ -0,0 +1,75 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Test the Arabic Normalization Filter + * + */ +public class TestPersianNormalizationFilter extends TestCase { + + public void testFarsiYeh() throws IOException { + check("های", "هاي"); + } + + public void testYehBarree() throws IOException { + check("هاے", "هاي"); + } + + public void testKeheh() throws IOException { + check("کشاندن", "كشاندن"); + } + + public void testHehYeh() throws IOException { + check("كتابۀ", "كتابه"); + } + + public void testHehHamzaAbove() throws IOException { + check("كتابهٔ", "كتابه"); + } + + public void testHehGoal() throws IOException { + check("زادہ", "زاده"); + } + + private void check(final String input, final String expected) throws IOException { + ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); + PersianNormalizationFilter filter = new PersianNormalizationFilter(tokenStream); + final Token reusableToken = new Token(); + Token nextToken = filter.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + filter.close(); + } + +}