Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 699362) +++ CHANGES.txt (working copy) @@ -1,4 +1,4 @@ -Lucene Change Log +Lucene Change Log $Id$ ======================= Trunk (not yet released) ======================= @@ -11,6 +11,8 @@ New features + 1. LUCENE-1406: Added Arabic analyzer. (Robert Muir) + Optimizations Documentation Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicStemFilter.java (revision 0) @@ -0,0 +1,125 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Test the Arabic Stem Filter: each case feeds one word through ArabicLetterTokenizer and {@link ArabicStemFilter} and checks the term of the first token + * + */ +public class TestArabicStemFilter extends TestCase { + + public void testAlPrefix() throws IOException { + check("الحسن", "حسن"); + } + + public void testWalPrefix() throws IOException { + check("والحسن", "حسن"); + } + + public void testBalPrefix() throws IOException { + check("بالحسن", "حسن"); + } + + public void testKalPrefix() throws IOException { + check("كالحسن", "حسن"); + } + + public void testFalPrefix() throws IOException { + check("فالحسن", "حسن"); + } + + public void testWaPrefix() throws IOException { + check("وحسن", "حسن"); + } + + public void testAhSuffix() throws IOException { + check("زوجها", "زوج"); + } + + public void testAnSuffix() throws IOException { + check("ساهدان", "ساهد"); + } + + public void testAtSuffix() throws IOException { + check("ساهدات", "ساهد"); + } + + public void testWnSuffix() throws IOException { + check("ساهدون", "ساهد"); + } + + public void testYnSuffix() throws IOException { + check("ساهدين", "ساهد"); + } + + public void testYhSuffix() throws IOException { + check("ساهديه", "ساهد"); + } + + public void testYpSuffix() throws IOException { + check("ساهدية", "ساهد"); + } + + public void testHSuffix() throws IOException { + check("ساهده", "ساهد"); + } + + public void testPSuffix() throws IOException { + check("ساهدة", "ساهد"); + } + + public void testYSuffix() throws IOException { + check("ساهدي", "ساهد"); + } + + public void testComboPrefSuf() throws IOException { + check("وساهدون", "ساهد"); + } + + public void testComboSuf() throws IOException { + check("ساهدهات", "ساهد"); + } + + public void testShouldntStem() throws IOException { + check("الو", 
"الو"); + } + + private void check(final String input, final String expected) throws IOException { + ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); + ArabicStemFilter filter = new ArabicStemFilter(tokenStream); + final Token reusableToken = new Token(); + Token nextToken = filter.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + filter.close(); + } + +} Index: contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/ar/TestArabicNormalizationFilter.java (revision 0) @@ -0,0 +1,106 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.StringReader; + +import junit.framework.TestCase; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Test the Arabic Normalization Filter: each case feeds one word through ArabicLetterTokenizer and {@link ArabicNormalizationFilter} and checks the term of the first token + * + */ +public class TestArabicNormalizationFilter extends TestCase { + + public void testAlifMadda() throws IOException { + check("آجن", "اجن"); + } + + public void testAlifHamzaAbove() throws IOException { + check("أحمد", "احمد"); + } + + public void testAlifHamzaBelow() throws IOException { + check("إعاذ", "اعاذ"); + } + + public void testAlifMaksura() throws IOException { + check("بنى", "بني"); + } + + public void testTehMarbuta() throws IOException { + check("فاطمة", "فاطمه"); + } + + public void testTatweel() throws IOException { + check("روبرـــــت", "روبرت"); + } + + public void testFatha() throws IOException { + check("مَبنا", "مبنا"); + } + + public void testKasra() throws IOException { + check("علِي", "علي"); + } + + public void testDamma() throws IOException { + check("بُوات", "بوات"); + } + + public void testFathatan() throws IOException { + check("ولداً", "ولدا"); + } + + public void testKasratan() throws IOException { + check("ولدٍ", "ولد"); + } + + public void testDammatan() throws IOException { + check("ولدٌ", "ولد"); + } + + public void testSukun() throws IOException { + check("نلْسون", "نلسون"); + } + + public void testShaddah() throws IOException { + check("هتميّ", "هتمي"); + } + + private void check(final String input, final String expected) throws IOException { + ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input)); + ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream); + final Token reusableToken = new Token(); + Token nextToken = 
filter.next(reusableToken); + if (nextToken == null) + fail(); + assertEquals(expected, nextToken.term()); + filter.close(); + } + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemFilter.java (revision 0) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words. NOTE(review): the constructor is protected, so the filter can only be created from this package or from a subclass - confirm that is intended. + * + */ + +public class ArabicStemFilter extends TokenFilter { + + protected ArabicStemmer stemmer = null; + + protected ArabicStemFilter(TokenStream input) { + super(input); + stemmer = new ArabicStemmer(); + } + + /** + * @return the next token from the wrapped stream with its term stemmed in place (term length updated when the stem is shorter), or null at end of stream + */ + public final Token next() throws IOException { + /** + * The actual token in the input stream. 
+ */ + Token token = null; + + if ((token = input.next()) == null) { + return null; + } else { + int oldlen = token.termLength(); + int newlen = stemmer.stem(token.termBuffer(), oldlen); + if (oldlen != newlen) + token.setTermLength(newlen); + return token; + } + } + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt (revision 0) @@ -0,0 +1,347 @@ +ب +ا +أ +، +عشر +عبد +عدد +عدة +عشرة +عدم +عام +عاما +عرفات +عن +عند +عمان +عندما +على +علي +عليه +عليها +عملية +زيارة +سبتمبر +ساراييفو +سنة +سوريا +سنوات +تشرين +تم +تموز +ضد +بعد +بعض +اعادة +اعلن +اعلنت +حزب +حزيران +بسبب +اسرائيل +حسين +حتى +اتفاق +صرب +اذا +احد +اثر +غزة +برس +باسم +اجتماع +غدا +شخصا +صباح +اطار +اربعة +بغداد +اخرى +باريس +رابين +شرق +بان +ابو +اجل +غير +حركة +رئيس +جديدة +اطلاق +بشكل +بطولة +صحيفة +حاليا +بن +به +ثم +اف +ان +او +اي +بها +جهة +صفر +حيث +اكد +الا +اما +العسكرية +العراق +العاصمة +العربية +العراقي +العراقية +العام +العالم +العلاقات +العمل +امس +السعودية +الساعة +السبت +السابق +روسيا +السلطة +السلطات +السلام +التعاون +التحرير +التى +التي +اكتوبر +دورة +اكثر +ايار +ايضا +الجزائر +حماس +الاسرائيلي +الاسرائيلية +الاسبوع +الاسلحة +الاسلامية +ذكرت +الاتحاد +الاتفاق +ثلاثة +الحرب +الاحد +الذاتي +الشرطة +الاربعاء +الغربية +الخارجية +الاردن +الشرق +ايران +الحدود +الرئيس +الاخيرة +الثاني +الثانية +الاثنين +شمال +بيان +دمشق +الذى +الذي +الان +امام +ايام +خلال +الشيخ +الجيش +الدور +الضفة +الجمعة +بيريز +الاوسط +الروسي +البوسنة +الروسية +بيروت +الانتخابات +البلاد +الدفاع +الثلثاء +الانباء +الثلاثاء +الاوروبي +حوالى +الذين +الدول +الحكم +الامم +الامن +الاول +الدولة +الخليج +الخميس +الاميركي +الاميركية +الدولي +الاولى +الدولية +الحكومة +بين +ذلك +دول +دون +حول +حين +الف +الى +انه +اول +ضمن +جنوب +دولة +انها +جميع +الوزراء +المتحدث +المتحدة 
+دولار +النار +الوضع +القدس +المحتلة +المصدر +المباراة +المصري +الماضي +المصرية +المرحلة +القدم +اللجنة +المجلس +الفرنسي +الفرنسية +القاهرة +المدينة +المانيا +الوطنية +المجموعة +الله +الفلسطيني +الفلسطينية +الفلسطينيين +الوقت +المقرر +القوات +النهائي +المقبل +المنطقة +الولايات +المفاوضات +الملك +اليمن +اليوم +ايلول +الكويت +ـ +ف +و +و6 +قد +لا +ما +مع +وزارة +وزير +مساء +قتل +كرة +مصر +هذا +فاز +كأس +ياسر +قرار +مصدر +واحد +قطاع +مصادر +مباراة +مبارك +واضاف +واضافت +فرانس +واشنطن +فان +قبل +قال +كان +لدى +نحو +هذه +وان +محمد +واكد +يذكر +مجلس +فرنسا +كريستوفر +كانت +واوضح +لبنان +مايو +مدينة +مجموعة +كانون +فى +في +كل +لم +لن +له +من +هو +هي +قوة +كما +لها +منذ +وقد +ولا +نفسه +موسكو +مقتل +لقاء +لكرة +نقطة +قوات +مقابل +لندن +هناك +وقال +وكان +منطقة +منظمة +نهاية +وكالة +وقالت +وكانت +للامم +فيه +كلم +لكن +وفي +وقف +ولم +ومن +وهو +وهي +يوم +فيها +منها +مليار +لوكالة +يكون +يمكن +كلينتون +مليون +يوليو +يونيو +نيويورك Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java (revision 0) @@ -0,0 +1,102 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Normalizer for Arabic. + *

+ * Normalization is done in-place for efficiency, operating on a termbuffer. + *

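+ * Normalization is defined as: + * <ul> + * <li> Normalization of alef with madda, alef with hamza above and alef with hamza below to a bare alef. + * <li> Normalization of dotless yeh (alef maksura) to yeh. + * <li> Normalization of teh marbuta to heh. + * <li> Removal of tatweel (stretching character). + * <li> Removal of the diacritics fathatan, dammatan, kasratan, fatha, damma, kasra, shadda and sukun. + * </ul>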
+ * + */ +public class ArabicNormalizer { + public static final char ALEF = '\u0627'; + public static final char ALEF_MADDA = '\u0622'; + public static final char ALEF_HAMZA_ABOVE = '\u0623'; + public static final char ALEF_HAMZA_BELOW = '\u0625'; + + public static final char YEH = '\u064A'; + public static final char DOTLESS_YEH = '\u0649'; + + public static final char TEH_MARBUTA = '\u0629'; + public static final char HEH = '\u0647'; + + public static final char TATWEEL = '\u0640'; + + public static final char FATHATAN = '\u064B'; + public static final char DAMMATAN = '\u064C'; + public static final char KASRATAN = '\u064D'; + public static final char FATHA = '\u064E'; + public static final char DAMMA = '\u064F'; + public static final char KASRA = '\u0650'; + public static final char SHADDA = '\u0651'; + public static final char SUKUN = '\u0652'; + + /** + * Normalize an input buffer of Arabic text in place: the three alef variants become ALEF, DOTLESS_YEH becomes YEH, TEH_MARBUTA becomes HEH, and TATWEEL plus the eight diacritic marks are deleted + * + * @param s input buffer, modified in place + * @param len length of input buffer (only indexes 0 to len-1 are examined) + * @return length of input buffer after normalization (len minus the number of deleted characters) + */ + public int normalize(char s[], int len) { + + for (int i = 0; i < len; i++) { + if (s[i] == ALEF_MADDA || s[i] == ALEF_HAMZA_ABOVE || s[i] == ALEF_HAMZA_BELOW) + s[i] = ALEF; + + if (s[i] == DOTLESS_YEH) + s[i] = YEH; + + if (s[i] == TEH_MARBUTA) + s[i] = HEH; + + if (s[i] == TATWEEL || s[i] == KASRATAN || s[i] == DAMMATAN || s[i] == FATHATAN || + s[i] == FATHA || s[i] == DAMMA || s[i] == KASRA || s[i] == SHADDA || s[i] == SUKUN) { + len = delete(s, i, len); + i--; + } + } + + return len; + } + + /** + * Delete a character in-place by shifting the characters after pos one slot to the left + * + * @param s Input Buffer + * @param pos Position of character to delete + * @param len length of input buffer + * @return length of input buffer after deletion (always len - 1) + */ + protected int delete(char s[], int pos, int len) { + if (pos < len) + System.arraycopy(s, pos + 1, s, pos, len - pos - 1); + + return len - 1; + } + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java 
=================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java (revision 0) @@ -0,0 +1,122 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Analyzer for Arabic. + *

+ * This analyzer implements light-stemming as specified by: + * + * Improving Stemming for Arabic Information Retrieval: + * Light Stemming and Co-occurrence Analysis + * + * http://ciir.cs.umass.edu/pubfiles/ir-249.pdf + *

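+ * The analysis package contains three primary components: + * <ul> + * <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization. + * <li>{@link ArabicStemFilter}: Arabic light stemming. + * <li>Arabic stop words file: a set of default Arabic stop words. + * </ul>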
+ * + */ +public final class ArabicAnalyzer extends Analyzer { + + /** + * Name of the classpath resource containing the default Arabic stopwords; the default constructor loads it relative to this class. + * + * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html + * The stopword list is BSD-Licensed. + */ + public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stoptable = new HashSet(); + + /** + * Builds an analyzer with the default stop words, read as UTF-8 from {@link #DEFAULT_STOPWORD_FILE}; an IOException while loading is rethrown as a RuntimeException. NOTE(review): the stream is closed only on the success path and a null result from getResourceAsStream is not checked - confirm and harden. + */ + public ArabicAnalyzer() { + try { + InputStream stream = ArabicAnalyzer.class.getResourceAsStream(DEFAULT_STOPWORD_FILE); + InputStreamReader reader = new InputStreamReader(stream, "UTF-8"); + stoptable = WordlistLoader.getWordSet(reader); + reader.close(); + stream.close(); + } catch (IOException e) { + // TODO: throw IOException + throw new RuntimeException(e); + } + } + + /** + * Builds an analyzer with the given stop words. + */ + public ArabicAnalyzer( String[] stopwords ) { + stoptable = StopFilter.makeStopSet( stopwords ); + } + + /** + * Builds an analyzer whose stop words are the keys of the given table. + */ + public ArabicAnalyzer( Hashtable stopwords ) { + stoptable = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the stop words loaded from the given file. + */ + public ArabicAnalyzer( File stopwords ) throws IOException { + stoptable = WordlistLoader.getWordSet( stopwords ); + } + + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader. + * + * @return A TokenStream built from an ArabicLetterTokenizer filtered with + * StopFilter, ArabicNormalizationFilter and ArabicStemFilter, in that order. 
+ */ + public final TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new ArabicLetterTokenizer( reader ); + result = new StopFilter( result, stoptable ); + result = new ArabicNormalizationFilter( result ); + result = new ArabicStemFilter( result ); + + return result; + } +} + Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (revision 0) @@ -0,0 +1,43 @@ +package org.apache.lucene.analysis.ar; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.LetterTokenizer; + +/** + * The problem with the standard Letter tokenizer is that it fails on diacritics. + * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc. 
+ * + * + */ +public class ArabicLetterTokenizer extends LetterTokenizer { + + public ArabicLetterTokenizer(Reader in) { + super(in); + } + + /** + * Accepts any character the parent LetterTokenizer accepts, plus any character whose Unicode general category is NON_SPACING_MARK + * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char) + */ + protected boolean isTokenChar(char c) { + return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK; + } + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilter.java (revision 0) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography. NOTE(review): the constructor is protected, so the filter can only be created from this package or from a subclass - confirm that is intended. 
+ * + */ + +public class ArabicNormalizationFilter extends TokenFilter { + + protected ArabicNormalizer normalizer = null; + + protected ArabicNormalizationFilter(TokenStream input) { + super(input); + normalizer = new ArabicNormalizer(); + } + + /** + * @return the next token from the wrapped stream with its term buffer normalized in place (term length updated when characters were removed), or null at end of stream + */ + public final Token next() throws IOException { + /** + * The token pulled from the wrapped input stream; it is modified in place and returned. + */ + Token token = null; + + if ((token = input.next()) == null) { + return null; + } else { + int oldlen = token.termLength(); + int newlen = normalizer.normalize(token.termBuffer(), oldlen); + if (oldlen != newlen) + token.setTermLength(newlen); + return token; + } + } + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ArabicStemmer.java (revision 0) @@ -0,0 +1,177 @@ +package org.apache.lucene.analysis.ar; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Stemmer for Arabic. + *

+ * Stemming is done in-place for efficiency, operating on a termbuffer. + *

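+ * Stemming is defined as: + * <ul> + * <li> Removal of one leading prefix from {@link #prefixes} (al-, wal-, bal-, kal-, fal-, wa-). + * <li> Removal of trailing suffixes from {@link #suffixes}. + * </ul>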
+ * + */ +public class ArabicStemmer { + public static final char ALEF = '\u0627'; + public static final char BEH = '\u0628'; + public static final char TEH_MARBUTA = '\u0629'; + public static final char TEH = '\u062A'; + public static final char FEH = '\u0641'; + public static final char KAF = '\u0643'; + public static final char LAM = '\u0644'; + public static final char NOON = '\u0646'; + public static final char HEH = '\u0647'; + public static final char WAW = '\u0648'; + public static final char YEH = '\u064A'; + + public static final char prefixes[][] = { + ("" + ALEF + LAM).toCharArray(), + ("" + WAW + ALEF + LAM).toCharArray(), + ("" + BEH + ALEF + LAM).toCharArray(), + ("" + KAF + ALEF + LAM).toCharArray(), + ("" + FEH + ALEF + LAM).toCharArray(), + ("" + WAW).toCharArray(), + }; + + public static final char suffixes[][] = { + ("" + HEH + ALEF).toCharArray(), + ("" + ALEF + NOON).toCharArray(), + ("" + ALEF + TEH).toCharArray(), + ("" + WAW + NOON).toCharArray(), + ("" + YEH + NOON).toCharArray(), + ("" + YEH + HEH).toCharArray(), + ("" + YEH + TEH_MARBUTA).toCharArray(), + ("" + HEH).toCharArray(), + ("" + TEH_MARBUTA).toCharArray(), + ("" + YEH).toCharArray(), +}; + + /** + * Stem an input buffer of Arabic text in place: prefix stemming is applied first, then suffix stemming. + * + * @param s input buffer, modified in place + * @param len length of input buffer + * @return length of input buffer after stemming + */ + public int stem(char s[], int len) { + len = stemPrefix(s, len); + len = stemSuffix(s, len); + + return len; + } + + /** + * Stem a prefix off an Arabic word: the first entry of {@link #prefixes} accepted by startsWith is removed and no further prefixes are tried. + * @param s input buffer + * @param len length of input buffer + * @return new length of input buffer after stemming. + */ + public int stemPrefix(char s[], int len) { + for (int i = 0; i < prefixes.length; i++) + if (startsWith(s, len, prefixes[i])) + return deleteN(s, 0, len, prefixes[i].length); + return len; + } + + /** + * Stem suffix(es) off an Arabic word: {@link #suffixes} is scanned once in order and every entry accepted by endsWith for the current length is removed. 
+ * @param s input buffer + * @param len length of input buffer + * @return new length of input buffer after stemming + */ + public int stemSuffix(char s[], int len) { + for (int i = 0; i < suffixes.length; i++) + if (endsWith(s, len, suffixes[i])) + len = deleteN(s, len - suffixes[i].length, len, suffixes[i].length); + return len; + } + + /** + * Returns true if the prefix matches and the word is long enough for it to be stemmed + * @param s input buffer + * @param len length of input buffer + * @param prefix prefix to check + * @return true if the prefix matches and can be stemmed + */ + boolean startsWith(char s[], int len, char prefix[]) { + if (prefix.length == 1 && len < 4) { // wa- prefix requires at least 3 characters after stemming + return false; + } else if (len < prefix.length + 2) { // other prefixes require only 2 characters after stemming. + return false; + } else { + for (int i = 0; i < prefix.length; i++) + if (s[i] != prefix[i]) + return false; + + return true; + } + } + + /** + * Returns true if the suffix matches and the word is long enough for it to be stemmed + * @param s input buffer + * @param len length of input buffer + * @param suffix suffix to check + * @return true if the suffix matches and can be stemmed + */ + boolean endsWith(char s[], int len, char suffix[]) { + if (len < suffix.length + 2) { // all suffixes require at least 2 characters after stemming + return false; + } else { + for (int i = 0; i < suffix.length; i++) + if (s[len - suffix.length + i] != suffix[i]) + return false; + + return true; + } + } + + + /** + * Delete n characters in-place by deleting one character at pos, nChars times + * + * @param s Input Buffer + * @param pos Position of first character to delete + * @param len Length of input buffer + * @param nChars number of characters to delete + * @return length of input buffer after deletion + */ + protected int deleteN(char s[], int pos, int len, int nChars) { + for (int i = 0; i < nChars; i++) + len = delete(s, pos, len); + return len; + } + + /** + * Delete a character in-place + * + * @param s Input Buffer + * @param pos Position of character to delete + * 
@param len length of input buffer + * @return length of input buffer after deletion + */ + protected int delete(char s[], int pos, int len) { + if (pos < len) + System.arraycopy(s, pos + 1, s, pos, len - pos - 1); + + return len - 1; + } + +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/package.html (revision 0) @@ -0,0 +1,5 @@ + + +Analyzer for Arabic. + +