Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java	(revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java	(revision 0)
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WordlistLoader;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for Bulgarian.
+ * <p>
+ * This analyzer implements light-stemming as specified by: 
+ * <i>
+ * Searching Strategies for the Bulgarian Language
+ * </i>
+ * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ * <p>
+ */
+public class BulgarianAnalyzer extends Analyzer {
+
+  /**
+   * File containing default Bulgarian stopwords.
+   *
+   * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
+   * The stopword list is BSD-Licensed.
+   */
+  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+  /**
+   * Contains the stopwords used with the StopFilter.
+   */
+  private final Set<?> stoptable;
+  /**
+   * The comment character in the stopwords file.  All lines prefixed with this will be ignored.
+   */
+  public static final String STOPWORDS_COMMENT = "#";
+
+  /**
+   * Returns an unmodifiable instance of the default stop-words set.
+   * @return an unmodifiable instance of the default stop-words set.
+   */
+  public static Set<String> getDefaultStopSet() {
+    return DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+   * accesses the static final set the first time.
+   */
+  private static class DefaultSetHolder {
+    static final Set<String> DEFAULT_STOP_SET;
+
+    static {
+      try {
+        DEFAULT_STOP_SET = loadDefaultStopWordSet();
+      } catch (IOException ex) {
+        // default set should always be present as it is part of the
+        // distribution (JAR); keep the cause so a real failure can be diagnosed
+        throw new RuntimeException("Unable to load default stopword set", ex);
+      }
+    }
+
+    static Set<String> loadDefaultStopWordSet() throws IOException {
+      InputStream stream = BulgarianAnalyzer.class
+          .getResourceAsStream(DEFAULT_STOPWORD_FILE);
+      try {
+        InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
+        // make sure it is unmodifiable as we expose it in the outer class
+        return Collections.unmodifiableSet(WordlistLoader.getWordSet(reader,
+            STOPWORDS_COMMENT));
+      } finally {
+        stream.close(); // closing the raw stream also releases the reader wrapped around it
+      }
+    }
+  }
+
+  private final Version matchVersion; // handed to StandardTokenizer and to StopFilter's version default
+
+  /**
+   * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
+   * @param matchVersion version passed to the tokenizer and used for the StopFilter default
+   */
+  public BulgarianAnalyzer(Version matchVersion) {
+    this.matchVersion = matchVersion;
+    stoptable = DefaultSetHolder.DEFAULT_STOP_SET;
+  }
+
+  /**
+   * Builds an analyzer with the given stop words.
+   * @param stopwords words to remove from the token stream
+   */
+  public BulgarianAnalyzer(Version matchVersion, String... stopwords) {
+    this.matchVersion = matchVersion;
+    stoptable = StopFilter.makeStopSet(stopwords);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words.
+   * Lines can be commented out using {@link #STOPWORDS_COMMENT}
+   * @throws IOException if loading the word list from the file fails
+   */
+  public BulgarianAnalyzer(Version matchVersion, File stopwords) throws IOException {
+    this.matchVersion = matchVersion;
+    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
+  }
+
+  /**
+   * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
+   *
+   * @return  A {@link TokenStream} built from an {@link StandardTokenizer} filtered with
+   *            {@link LowerCaseFilter}, {@link StopFilter}, and {@link BulgarianStemFilter}.
+   */
+  @Override
+  public final TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new StandardTokenizer(matchVersion, reader);
+    result = new LowerCaseFilter(result);
+    result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
+                            result, stoptable);
+    result = new BulgarianStemFilter(result);
+    return result;
+  }
+
+  private static class SavedStreams { // uses no analyzer state, so it does not need an outer instance
+    Tokenizer source;
+    TokenStream result;
+  }
+
+  /**
+   * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
+   * in the provided {@link Reader}.
+   *
+   * @return  A {@link TokenStream} built from an {@link StandardTokenizer} filtered with
+   *            {@link LowerCaseFilter}, {@link StopFilter}, and {@link BulgarianStemFilter}.
+   */
+  @Override
+  public TokenStream reusableTokenStream(String fieldName, Reader reader)
+      throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) { // nothing cached yet: build the chain and remember it
+      streams = new SavedStreams();
+      streams.source = new StandardTokenizer(matchVersion, reader);
+      streams.result = new LowerCaseFilter(streams.source);
+      streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
+                                      streams.result, stoptable);
+      streams.result = new BulgarianStemFilter(streams.result);
+      setPreviousTokenStream(streams);
+    } else {
+      streams.source.reset(reader); // cached chain: only point the tokenizer at the new reader
+    }
+    return streams.result;
+  }
+}

Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\bg\BulgarianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java	(revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilter.java	(revision 0)
@@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian words.
+ */
+public class BulgarianStemFilter extends TokenFilter {
+  private final BulgarianStemmer stemmer = new BulgarianStemmer(); // one stemmer per filter
+  private final TermAttribute termAtt; // term whose buffer is stemmed in place
+
+  /** Wraps {@code input} so that every term it emits is passed through the stemmer. */
+  public BulgarianStemFilter(TokenStream input) {
+    super(input);
+    termAtt = addAttribute(TermAttribute.class);
+  }
+
+  /** Pulls the next token from the wrapped stream and stems its term in place. */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false; // wrapped stream is exhausted
+    }
+    final char[] buffer = termAtt.termBuffer();
+    termAtt.setTermLength(stemmer.stem(buffer, termAtt.termLength()));
+    return true;
+  }
+}

Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\bg\BulgarianStemFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java	(revision 0)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemmer.java	(revision 0)
@@ -0,0 +1,154 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Bulgarian.
+ * <p>
+ * Implements the algorithm described in:  
+ * <i>
+ * Searching Strategies for the Bulgarian Language
+ * </i>
+ * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
+ */
+public class BulgarianStemmer {
+  /**
+   * Stems a Bulgarian word in place; words shorter than four characters are left alone.
+   * @param s buffer holding the word; its characters may be rewritten
+   * @param len number of valid characters in {@code s}
+   * @return length of the stem, counted from the start of {@code s}
+   */
+  public int stem(char[] s, int len) {
+    if (len < 4) {
+      return len; // too short to stem
+    }
+    if (len > 5 && endsWith(s, len, "ища")) {
+      return len - 3;
+    }
+
+    int end = removePlural(s, removeArticle(s, len));
+    if (end > 3) {
+      if (s[end - 1] == 'я') {
+        end--;
+      }
+      final char last = s[end - 1];
+      if (last == 'а' || last == 'о' || last == 'е') {
+        end--;
+      }
+      if (endsWith(s, end, "ен")) {
+        s[end - 2] = 'н'; // "ен" becomes "н"
+        end--;
+      }
+    }
+
+    if (end > 4 && endsWith(s, end, "ен")) {
+      s[end - 2] = 'н'; // "ен" becomes "н"
+      end--;
+    }
+
+    if (end > 5 && s[end - 2] == 'ъ') {
+      s[end - 2] = s[end - 1]; // drop the ъ that precedes the last letter
+      end--;
+    }
+    return end;
+  }
+
+  /**
+   * Mainly strips a trailing definite article; longer suffixes need longer words.
+   * @param word buffer holding the word (only read here)
+   * @param length number of valid characters in {@code word}
+   * @return length of the word once the article, if any, is cut off
+   */
+  private int removeArticle(char[] word, int length) {
+    if (length > 6 && endsWith(word, length, "ият")) {
+      return length - 3;
+    }
+    final boolean twoLetterArticle =
+        (length > 5 && (endsWith(word, length, "ът")
+            || endsWith(word, length, "то")
+            || endsWith(word, length, "те")
+            || endsWith(word, length, "та")
+            || endsWith(word, length, "ия")))
+        || (length > 4 && endsWith(word, length, "ят"));
+    return twoLetterArticle ? length - 2 : length;
+  }
+
+  /**
+   * Strips or rewrites a trailing plural suffix; the first matching rule wins.
+   * @param word buffer holding the word; some rules rewrite one character
+   * @param length number of valid characters in {@code word}
+   * @return length of the word after the matching rule, or {@code length} if none matched
+   */
+  private int removePlural(char[] word, int length) {
+    if (length > 6) {
+      if (endsWith(word, length, "овци") || endsWith(word, length, "ове")) {
+        return length - 3; // "овци" keeps its leading о, "ове" is removed entirely
+      }
+      if (endsWith(word, length, "еве")) {
+        word[length - 3] = 'й'; // "еве" becomes "й"
+        return length - 2;
+      }
+    }
+
+    if (length > 5) {
+      if (endsWith(word, length, "ища")) {
+        return length - 3;
+      }
+      if (endsWith(word, length, "та")) {
+        return length - 2;
+      }
+      if (endsWith(word, length, "ци")) {
+        word[length - 2] = 'к'; // "ци" becomes "к"
+        return length - 1;
+      }
+      if (endsWith(word, length, "зи")) {
+        word[length - 2] = 'г'; // "зи" becomes "г"
+        return length - 1;
+      }
+      if (word[length - 3] == 'е' && word[length - 1] == 'и') {
+        word[length - 3] = 'я'; // е turns into я and the final и is dropped
+        return length - 1;
+      }
+    }
+
+    if (length > 4) {
+      if (endsWith(word, length, "си")) {
+        word[length - 2] = 'х'; // "си" becomes "х"
+        return length - 1;
+      }
+      if (endsWith(word, length, "и")) {
+        return length - 1;
+      }
+    }
+    return length;
+  }
+
+  /** Tells whether the first {@code length} characters of {@code word} end with {@code suffix}. */
+  private boolean endsWith(char[] word, int length, String suffix) {
+    final int suffixLength = suffix.length();
+    if (suffixLength > length) {
+      return false;
+    }
+    for (int back = 1; back <= suffixLength; back++) { // compare from the last character backwards
+      if (word[length - back] != suffix.charAt(suffixLength - back)) {
+        return false;
+      }
+    }
+    return true;
+  }
+}

Property changes on: contrib\analyzers\common\src\java\org\apache\lucene\analysis\bg\BulgarianStemmer.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt
===================================================================
--- contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt	(revision 0)
+++ contrib/analyzers/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt	(revision 0)
@@ -0,0 +1,262 @@
+# This file was created by Jacques Savoy and is distributed under the BSD license.
+# See http://members.unine.ch/jacques.savoy/clef/index.html.
+# Also see http://www.opensource.org/licenses/bsd-license.html
+а
+автентичен
+аз
+ако
+ала
+бе
+без
+беше
+би
+бивш
+бивша
+бившо
+бил
+била
+били
+било
+благодаря
+близо
+бъдат
+бъде
+бяха
+в
+вас
+ваш
+ваша
+вероятно
+вече
+взема
+ви
+вие
+винаги
+внимава
+време
+все
+всеки
+всички
+всичко
+всяка
+във
+въпреки
+върху
+г
+ги
+главен
+главна
+главно
+глас
+го
+година
+години
+годишен
+д
+да
+дали
+два
+двама
+двамата
+две
+двете
+ден
+днес
+дни
+до
+добра
+добре
+добро
+добър
+докато
+докога
+дори
+досега
+доста
+друг
+друга
+други
+е
+евтин
+едва
+един
+една
+еднаква
+еднакви
+еднакъв
+едно
+екип
+ето
+живот
+за
+забавям
+зад
+заедно
+заради
+засега
+заспал
+затова
+защо
+защото
+и
+из
+или
+им
+има
+имат
+иска
+й
+каза
+как
+каква
+какво
+както
+какъв
+като
+кога
+когато
+което
+които
+кой
+който
+колко
+която
+къде
+където
+към
+лесен
+лесно
+ли
+лош
+м
+май
+малко
+ме
+между
+мек
+мен
+месец
+ми
+много
+мнозина
+мога
+могат
+може
+мокър
+моля
+момента
+му
+н
+на
+над
+назад
+най
+направи
+напред
+например
+нас
+не
+него
+нещо
+нея
+ни
+ние
+никой
+нито
+нищо
+но
+нов
+нова
+нови
+новина
+някои
+някой
+няколко
+няма
+обаче
+около
+освен
+особено
+от
+отгоре
+отново
+още
+пак
+по
+повече
+повечето
+под
+поне
+поради
+после
+почти
+прави
+пред
+преди
+през
+при
+пък
+първата
+първи
+първо
+пъти
+равен
+равна
+с
+са
+сам
+само
+се
+сега
+си
+син
+скоро
+след
+следващ
+сме
+смях
+според
+сред
+срещу
+сте
+съм
+със
+също
+т
+тази
+така
+такива
+такъв
+там
+твой
+те
+тези
+ти
+т.н.
+то
+това
+тогава
+този
+той
+толкова
+точно
+три
+трябва
+тук
+тъй
+тя
+тях
+у
+утре
+харесва
+хиляди
+ч
+часа
+че
+често
+чрез
+ще
+щом
+юмрук
+я
+як

Property changes on: contrib\analyzers\common\src\resources\org\apache\lucene\analysis\bg\stopwords.txt
___________________________________________________________________
Added: svn:eol-style
   + native

Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java
===================================================================
--- contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java	(revision 0)
+++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianAnalyzer.java	(revision 0)
@@ -0,0 +1,93 @@
+package org.apache.lucene.analysis.bg;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.util.Version;
+
+/**
+ * Test the Bulgarian analyzer / light stemmer
+ */
+public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
+
+  /** Builds an analyzer that uses the bundled default stopword list. */
+  private static Analyzer defaultAnalyzer() {
+    return new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+  }
+
+  /** Constructing the analyzer fails when the stopwords file is missing from the classpath. */
+  public void testResourcesAvailable() {
+    defaultAnalyzer();
+  }
+
+  /** With the default list, only the non-stopword token is expected to survive. */
+  public void testStopwords() throws IOException {
+    final Analyzer analyzer = defaultAnalyzer();
+    assertAnalyzesTo(analyzer, "Как се казваш?", new String[] { "казваш" });
+  }
+
+  /** With an empty custom stopword list, all three tokens are expected. */
+  public void testCustomStopwords() throws IOException {
+    final String[] noStopwords = {};
+    final Analyzer analyzer = new BulgarianAnalyzer(Version.LUCENE_CURRENT, noStopwords);
+    assertAnalyzesTo(analyzer, "Как се казваш?", new String[] { "как", "се", "казваш" });
+  }
+
+  /** Runs two inputs, one after the other, through the reuse path of a single analyzer. */
+  public void testReusableTokenStream() throws IOException {
+    final Analyzer analyzer = defaultAnalyzer();
+    assertAnalyzesToReuse(analyzer, "документи", new String[] { "документ" });
+    assertAnalyzesToReuse(analyzer, "документ", new String[] { "документ" });
+  }
+
+  /** Test some examples from the paper. */
+  public void testBasicExamples() throws IOException {
+    final Analyzer analyzer = defaultAnalyzer();
+    assertAnalyzesTo(analyzer, "енергийни кризи", new String[] { "енергийн", "криз" });
+    assertAnalyzesTo(analyzer, "Атомната енергия", new String[] { "атомн", "енерг" });
+
+    assertAnalyzesTo(analyzer, "компютри", new String[] { "компютр" });
+    assertAnalyzesTo(analyzer, "компютър", new String[] { "компютр" });
+
+    assertAnalyzesTo(analyzer, "градове", new String[] { "град" });
+  }
+
+  /** Test removal of the plural suffix. */
+  public void testPlural() throws IOException {
+    final Analyzer analyzer = defaultAnalyzer();
+    assertAnalyzesTo(analyzer, "документи", new String[] { "документ" });
+    assertAnalyzesTo(analyzer, "документ", new String[] { "документ" });
+  }
+
+  /** Test removal of the article suffix. */
+  public void testArticle() throws IOException {
+    final Analyzer analyzer = defaultAnalyzer();
+    assertAnalyzesTo(analyzer, "първият", new String[] { "първ" });
+  }
+
+  /** Test the rewrite rules of this stemming algorithm. */
+  public void testRewrite() throws IOException {
+    final Analyzer analyzer = defaultAnalyzer();
+    assertAnalyzesTo(analyzer, "промени", new String[] { "промян" });
+    assertAnalyzesTo(analyzer, "защитници", new String[] { "защитник" });
+    assertAnalyzesTo(analyzer, "хирурзите", new String[] { "хирург" });
+  }
+}

Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\bg\TestBulgarianAnalyzer.java
___________________________________________________________________
Added: svn:eol-style
   + native

