Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java (revision 0)
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+
+/**
+ * Analyzer for Persian.
+ *
+ * Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around ZWNJ in addition to space.
+ * Some persian-specific variant forms (such as farsi yeh and keheh) are standardized.
+ * "Stemming" is accomplished via stopwords.
+ *
+ */
+public final class PersianAnalyzer extends Analyzer {
+
+ /**
+ * File containing default Persian stopwords.
+ *
+ * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+ * The stopword list is BSD-Licensed.
+ *
+ */
+ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ private Set stoptable = new HashSet();
+ /**
+ * The comment character in the stopwords file. All lines prefixed with this will be ignored
+ */
+ public static final String STOPWORDS_COMMENT = "#";
+
+ /**
+ * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+ */
+ public PersianAnalyzer() {
+ try {
+ InputStream stream = PersianAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE);
+ InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+ stoptable = WordlistLoader.getWordSet(reader, STOPWORDS_COMMENT);
+ reader.close();
+ stream.close();
+ } catch (IOException e) {
+ // TODO: throw IOException
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ */
+ public PersianAnalyzer( String[] stopwords ) {
+ stoptable = StopFilter.makeStopSet( stopwords );
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ */
+ public PersianAnalyzer( Hashtable stopwords ) {
+ stoptable = new HashSet(stopwords.keySet());
+ }
+
+ /**
+ * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
+ */
+ public PersianAnalyzer( File stopwords ) throws IOException {
+ stoptable = WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT);
+ }
+
+
+ /**
+ * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ *
+ * @return A TokenStream built from an ArabicLetterTokenizer filtered with
+ * ArabicNormalizationFilter, PersianNormalizationFilter and Persian Stop words
+ */
+ public final TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream result = new ArabicLetterTokenizer( reader );
+ result = new ArabicNormalizationFilter( result );
+ result = new PersianNormalizationFilter( result ); // additional persian-specific normalization
+ result = new StopFilter( result, stoptable );
+
+ return result;
+ }
+}
+
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java (revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilter.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A TokenFilter that applies {@link PersianNormalizer} to normalize the orthography.
+ *
+ */
+
+public class PersianNormalizationFilter extends TokenFilter {
+
+ protected PersianNormalizer normalizer = null;
+
+ public PersianNormalizationFilter(TokenStream input) {
+ super(input);
+ normalizer = new PersianNormalizer();
+ }
+
+
+
+ public Token next(Token reusableToken) throws IOException {
+ if ((reusableToken = input.next(reusableToken)) == null) {
+ return null;
+ } else {
+ int oldlen = reusableToken.termLength();
+ int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
+ if (oldlen != newlen)
+ reusableToken.setTermLength(newlen);
+ return reusableToken;
+ }
+ }
+}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java (revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/PersianNormalizer.java (revision 0)
@@ -0,0 +1,91 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Normalizer for Persian.
+ *
+ * Normalization is done in-place for efficiency, operating on a termbuffer.
+ *
+ * Normalization is defined as:
+ *
+ * - Normalization of various heh + hamza forms and heh goal to heh.
+ *
+ * - Normalization of farsi yeh and yeh barree to arabic yeh
+ *
+ * - Normalization of persian keheh to arabic kaf
+ *
+ *
+ */
+public class PersianNormalizer {
+ public static final char YEH = '\u064A';
+ public static final char FARSI_YEH = '\u06CC';
+ public static final char YEH_BARREE = '\u06D2';
+
+ public static final char KEHEH = '\u06A9';
+ public static final char KAF = '\u0643';
+
+ public static final char HAMZA_ABOVE = '\u0654';
+ public static final char HEH_YEH = '\u06C0';
+ public static final char HEH_GOAL = '\u06C1';
+ public static final char HEH = '\u0647';
+
+
+ /**
+ * Normalize an input buffer of Persian text
+ *
+ * @param s input buffer
+ * @param len length of input buffer
+ * @return length of input buffer after normalization
+ */
+ public int normalize(char s[], int len) {
+
+ for (int i = 0; i < len; i++) {
+ if (s[i] == FARSI_YEH || s[i] == YEH_BARREE)
+ s[i] = YEH;
+
+ if (s[i] == KEHEH)
+ s[i] = KAF;
+
+ if (s[i] == HEH_YEH || s[i] == HEH_GOAL)
+ s[i] = HEH;
+
+ if (s[i] == HAMZA_ABOVE) { // necessary for HEH + HAMZA
+ len = delete(s, i, len);
+ i--;
+ }
+ }
+
+ return len;
+ }
+
+ /**
+ * Delete a character in-place
+ *
+ * @param s Input Buffer
+ * @param pos Position of character to delete
+ * @param len length of input buffer
+ * @return length of input buffer after deletion
+ */
+ protected int delete(char s[], int pos, int len) {
+ if (pos < len)
+ System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
+
+ return len - 1;
+ }
+
+
+}
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/fa/package.html
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/fa/package.html (revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/fa/package.html (revision 0)
@@ -0,0 +1,5 @@
+
+
+Analyzer for Persian.
+
+
Index: contrib/analyzers/src/resources/org/apache/lucene/analysis/fa/stopwords.txt
===================================================================
--- contrib/analyzers/src/resources/org/apache/lucene/analysis/fa/stopwords.txt (revision 0)
+++ contrib/analyzers/src/resources/org/apache/lucene/analysis/fa/stopwords.txt (revision 0)
@@ -0,0 +1,311 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+انان
+نداشته
+سراسر
+خياه
+ايشان
+وي
+تاكنون
+بيشتري
+دوم
+پس
+ناشي
+وگو
+يا
+داشتند
+سپس
+هنگام
+هرگز
+پنج
+نشان
+امسال
+ديگر
+گروهي
+شدند
+چطور
+ده
+و
+دو
+نخستين
+ولي
+چرا
+چه
+وسط
+ه
+كدام
+قابل
+يك
+رفت
+هفت
+همچنين
+در
+هزار
+بله
+بلي
+شايد
+اما
+شناسي
+گرفته
+دهد
+داشته
+دانست
+داشتن
+خواهيم
+ميليارد
+وقتيكه
+امد
+خواهد
+جز
+اورده
+شده
+بلكه
+خدمات
+شدن
+برخي
+نبود
+بسياري
+جلوگيري
+حق
+كردند
+نوعي
+بعري
+نكرده
+نظير
+نبايد
+بوده
+بودن
+داد
+اورد
+هست
+جايي
+شود
+دنبال
+داده
+بايد
+سابق
+هيچ
+همان
+انجا
+كمتر
+كجاست
+گردد
+كسي
+تر
+مردم
+تان
+دادن
+بودند
+سري
+جدا
+ندارند
+مگر
+يكديگر
+دارد
+دهند
+بنابراين
+هنگامي
+سمت
+جا
+انچه
+خود
+دادند
+زياد
+دارند
+اثر
+بدون
+بهترين
+بيشتر
+البته
+به
+براساس
+بيرون
+كرد
+بعضي
+گرفت
+توي
+اي
+ميليون
+او
+جريان
+تول
+بر
+مانند
+برابر
+باشيم
+مدتي
+گويند
+اكنون
+تا
+تنها
+جديد
+چند
+بي
+نشده
+كردن
+كردم
+گويد
+كرده
+كنيم
+نمي
+نزد
+روي
+قصد
+فقط
+بالاي
+ديگران
+اين
+ديروز
+توسط
+سوم
+ايم
+دانند
+سوي
+استفاده
+شما
+كنار
+داريم
+ساخته
+طور
+امده
+رفته
+نخست
+بيست
+نزديك
+طي
+كنيد
+از
+انها
+تمامي
+داشت
+يكي
+طريق
+اش
+چيست
+روب
+نمايد
+گفت
+چندين
+چيزي
+تواند
+ام
+ايا
+با
+ان
+ايد
+ترين
+اينكه
+ديگري
+راه
+هايي
+بروز
+همچنان
+پاعين
+كس
+حدود
+مختلف
+مقابل
+چيز
+گيرد
+ندارد
+ضد
+همچون
+سازي
+شان
+مورد
+باره
+مرسي
+خويش
+برخوردار
+چون
+خارج
+شش
+هنوز
+تحت
+ضمن
+هستيم
+گفته
+فكر
+بسيار
+پيش
+براي
+روزهاي
+انكه
+نخواهد
+بالا
+كل
+وقتي
+كي
+چنين
+كه
+گيري
+نيست
+است
+كجا
+كند
+نيز
+يابد
+بندي
+حتي
+توانند
+عقب
+خواست
+كنند
+بين
+تمام
+همه
+ما
+باشند
+مثل
+شد
+اري
+باشد
+اره
+طبق
+بعد
+اگر
+صورت
+غير
+جاي
+بيش
+ريزي
+اند
+زيرا
+چگونه
+بار
+لطفا
+مي
+درباره
+من
+ديده
+همين
+گذاري
+برداري
+علت
+گذاشته
+هم
+فوق
+نه
+ها
+شوند
+اباد
+همواره
+هر
+اول
+خواهند
+چهار
+نام
+امروز
+مان
+هاي
+قبل
+كنم
+سعي
+تازه
+را
+هستند
+زير
+جلوي
+عنوان
+بود
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (revision 0)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianAnalyzer.java (revision 0)
@@ -0,0 +1,210 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Test the Persian Analyzer
+ *
+ */
+public class TestPersianAnalyzer extends TestCase {
+
+ /** This test fails with NPE when the
+ * stopwords file is missing in classpath */
+ public void testResourcesAvailable() {
+ new PersianAnalyzer();
+ }
+
+ /**
+ * This test shows how the combination of tokenization (breaking on zero-width non-joiner),
+ * normalization (such as treating arabic YEH and farsi YEH the same),
+ * and stopwords creates a light-stemming effect for verbs.
+ *
+ * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
+ */
+ public void testBehaviorVerbs() throws Exception {
+ Analyzer a = new PersianAnalyzer();
+ // active present indicative
+ assertAnalyzesTo(a, "میخورد", new String[] { "خورد" });
+ // active preterite indicative
+ assertAnalyzesTo(a, "خورد", new String[] { "خورد" });
+ // active imperfective preterite indicative
+ assertAnalyzesTo(a, "میخورد", new String[] { "خورد" });
+ // active future indicative
+ assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد" });
+ // active present progressive indicative
+ assertAnalyzesTo(a, "دارد میخورد", new String[] { "خورد" });
+ // active preterite progressive indicative
+ assertAnalyzesTo(a, "داشت میخورد", new String[] { "خورد" });
+
+ // active perfect indicative
+ assertAnalyzesTo(a, "خوردهاست", new String[] { "خورده" });
+ // active imperfective perfect indicative
+ assertAnalyzesTo(a, "میخوردهاست", new String[] { "خورده" });
+ // active pluperfect indicative
+ assertAnalyzesTo(a, "خورده بود", new String[] { "خورده" });
+ // active imperfective pluperfect indicative
+ assertAnalyzesTo(a, "میخورده بود", new String[] { "خورده" });
+ // active preterite subjunctive
+ assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده" });
+ // active imperfective preterite subjunctive
+ assertAnalyzesTo(a, "میخورده باشد", new String[] { "خورده" });
+ // active pluperfect subjunctive
+ assertAnalyzesTo(a, "خورده بوده باشد", new String[] { "خورده" });
+ // active imperfective pluperfect subjunctive
+ assertAnalyzesTo(a, "میخورده بوده باشد", new String[] { "خورده" });
+ // passive present indicative
+ assertAnalyzesTo(a, "خورده میشود", new String[] { "خورده" });
+ // passive preterite indicative
+ assertAnalyzesTo(a, "خورده شد", new String[] { "خورده" });
+ // passive imperfective preterite indicative
+ assertAnalyzesTo(a, "خورده میشد", new String[] { "خورده" });
+ // passive perfect indicative
+ assertAnalyzesTo(a, "خورده شدهاست", new String[] { "خورده" });
+ // passive imperfective perfect indicative
+ assertAnalyzesTo(a, "خورده میشدهاست", new String[] { "خورده" });
+ // passive pluperfect indicative
+ assertAnalyzesTo(a, "خورده شده بود", new String[] { "خورده" });
+ // passive imperfective pluperfect indicative
+ assertAnalyzesTo(a, "خورده میشده بود", new String[] { "خورده" });
+ // passive future indicative
+ assertAnalyzesTo(a, "خورده خواهد شد", new String[] { "خورده" });
+ // passive present progressive indicative
+ assertAnalyzesTo(a, "دارد خورده میشود", new String[] { "خورده" });
+ // passive preterite progressive indicative
+ assertAnalyzesTo(a, "داشت خورده میشد", new String[] { "خورده" });
+ // passive present subjunctive
+ assertAnalyzesTo(a, "خورده شود", new String[] { "خورده" });
+ // passive preterite subjunctive
+ assertAnalyzesTo(a, "خورده شده باشد", new String[] { "خورده" });
+ // passive imperfective preterite subjunctive
+ assertAnalyzesTo(a, "خورده میشده باشد", new String[] { "خورده" });
+ // passive pluperfect subjunctive
+ assertAnalyzesTo(a, "خورده شده بوده باشد", new String[] { "خورده" });
+ // passive imperfective pluperfect subjunctive
+ assertAnalyzesTo(a, "خورده میشده بوده باشد", new String[] { "خورده" });
+
+ // active present subjunctive
+ assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" });
+ }
+
+ /**
+ * This test shows how the combination of tokenization and stopwords creates a light-stemming effect for verbs.
+ *
+ * In this case, these forms are presented with alternative orthography, using arabic yeh and whitespace.
+ * This yeh phenomenon is common for legacy text due to some previous bugs in Microsoft Windows.
+ *
+ * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
+ */
+ public void testBehaviorVerbsDefective() throws Exception {
+ Analyzer a = new PersianAnalyzer();
+ // active present indicative
+ assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
+ // active preterite indicative
+ assertAnalyzesTo(a, "خورد", new String[] { "خورد" });
+ // active imperfective preterite indicative
+ assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
+ // active future indicative
+ assertAnalyzesTo(a, "خواهد خورد", new String[] { "خورد" });
+ // active present progressive indicative
+ assertAnalyzesTo(a, "دارد مي خورد", new String[] { "خورد" });
+ // active preterite progressive indicative
+ assertAnalyzesTo(a, "داشت مي خورد", new String[] { "خورد" });
+
+ // active perfect indicative
+ assertAnalyzesTo(a, "خورده است", new String[] { "خورده" });
+ // active imperfective perfect indicative
+ assertAnalyzesTo(a, "مي خورده است", new String[] { "خورده" });
+ // active pluperfect indicative
+ assertAnalyzesTo(a, "خورده بود", new String[] { "خورده" });
+ // active imperfective pluperfect indicative
+ assertAnalyzesTo(a, "مي خورده بود", new String[] { "خورده" });
+ // active preterite subjunctive
+ assertAnalyzesTo(a, "خورده باشد", new String[] { "خورده" });
+ // active imperfective preterite subjunctive
+ assertAnalyzesTo(a, "مي خورده باشد", new String[] { "خورده" });
+ // active pluperfect subjunctive
+ assertAnalyzesTo(a, "خورده بوده باشد", new String[] { "خورده" });
+ // active imperfective pluperfect subjunctive
+ assertAnalyzesTo(a, "مي خورده بوده باشد", new String[] { "خورده" });
+ // passive present indicative
+ assertAnalyzesTo(a, "خورده مي شود", new String[] { "خورده" });
+ // passive preterite indicative
+ assertAnalyzesTo(a, "خورده شد", new String[] { "خورده" });
+ // passive imperfective preterite indicative
+ assertAnalyzesTo(a, "خورده مي شد", new String[] { "خورده" });
+ // passive perfect indicative
+ assertAnalyzesTo(a, "خورده شده است", new String[] { "خورده" });
+ // passive imperfective perfect indicative
+ assertAnalyzesTo(a, "خورده مي شده است", new String[] { "خورده" });
+ // passive pluperfect indicative
+ assertAnalyzesTo(a, "خورده شده بود", new String[] { "خورده" });
+ // passive imperfective pluperfect indicative
+ assertAnalyzesTo(a, "خورده مي شده بود", new String[] { "خورده" });
+ // passive future indicative
+ assertAnalyzesTo(a, "خورده خواهد شد", new String[] { "خورده" });
+ // passive present progressive indicative
+ assertAnalyzesTo(a, "دارد خورده مي شود", new String[] { "خورده" });
+ // passive preterite progressive indicative
+ assertAnalyzesTo(a, "داشت خورده مي شد", new String[] { "خورده" });
+ // passive present subjunctive
+ assertAnalyzesTo(a, "خورده شود", new String[] { "خورده" });
+ // passive preterite subjunctive
+ assertAnalyzesTo(a, "خورده شده باشد", new String[] { "خورده" });
+ // passive imperfective preterite subjunctive
+ assertAnalyzesTo(a, "خورده مي شده باشد", new String[] { "خورده" });
+ // passive pluperfect subjunctive
+ assertAnalyzesTo(a, "خورده شده بوده باشد", new String[] { "خورده" });
+ // passive imperfective pluperfect subjunctive
+ assertAnalyzesTo(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
+
+ // active present subjunctive
+ assertAnalyzesTo(a, "بخورد", new String[] { "بخورد" });
+ }
+
+ /**
+ * This test shows how the combination of tokenization (breaking on zero-width non-joiner or space)
+ * and stopwords creates a light-stemming effect for nouns, removing the plural -ha.
+ */
+ public void testBehaviorNouns() throws Exception {
+ Analyzer a = new PersianAnalyzer();
+ assertAnalyzesTo(a, "برگ ها", new String[] { "برگ" });
+ assertAnalyzesTo(a, "برگها", new String[] { "برگ" });
+ }
+
+
+ public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
+ TokenStream ts = a.tokenStream("dummy", new StringReader(input));
+ final Token reusableToken = new Token();
+ for (int i = 0; i < output.length; i++) {
+ Token nextToken = ts.next(reusableToken);
+ assertNotNull(nextToken);
+ assertEquals(output[i], nextToken.term()); // JUnit convention: expected first, actual second
+ }
+ assertNull(ts.next(reusableToken));
+ ts.close();
+ }
+
+}
Index: contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java
===================================================================
--- contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (revision 0)
+++ contrib/analyzers/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilter.java (revision 0)
@@ -0,0 +1,75 @@
+package org.apache.lucene.analysis.fa;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * Test the Persian Normalization Filter
+ *
+ */
+public class TestPersianNormalizationFilter extends TestCase {
+
+ public void testFarsiYeh() throws IOException {
+ check("های", "هاي");
+ }
+
+ public void testYehBarree() throws IOException {
+ check("هاے", "هاي");
+ }
+
+ public void testKeheh() throws IOException {
+ check("کشاندن", "كشاندن");
+ }
+
+ public void testHehYeh() throws IOException {
+ check("كتابۀ", "كتابه");
+ }
+
+ public void testHehHamzaAbove() throws IOException {
+ check("كتابهٔ", "كتابه");
+ }
+
+ public void testHehGoal() throws IOException {
+ check("زادہ", "زاده");
+ }
+
+ private void check(final String input, final String expected) throws IOException {
+ ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
+ PersianNormalizationFilter filter = new PersianNormalizationFilter(tokenStream);
+ final Token reusableToken = new Token();
+ Token nextToken = filter.next(reusableToken);
+ if (nextToken == null)
+ fail();
+ assertEquals(expected, nextToken.term());
+ filter.close();
+ }
+
+}