Index: solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java (revision 0)
+++ solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java (revision 0)
@@ -0,0 +1,55 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter;
+import org.apache.solr.common.SolrException;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link KuromojiKatakanaStemFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KuromojiKatakanaStemFilterFactory"
+ *             minimumLength="4"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ */
+public class KuromojiKatakanaStemFilterFactory extends BaseTokenFilterFactory {
+  /** Name of the configuration attribute that carries the minimum term length. */
+  private static final String MINIMUM_LENGTH_PARAM = "minimumLength";
+
+  /** Minimum term length handed to every filter created by this factory; assigned in {@link #init(Map)}. */
+  private int minimumLength;
+
+  /**
+   * Initializes the factory from its configuration arguments.
+   * <p>
+   * The {@code minimumLength} argument is read through the inherited {@code getInt} helper with
+   * {@link KuromojiKatakanaStemFilter#DEFAULT_MINIMUM_LENGTH} passed as the fallback value.
+   *
+   * @param args configuration arguments of this factory
+   * @throws SolrException if the resulting minimum length is smaller than 2
+   */
+  // NOTE(review): the raw Map type is kept on purpose so the override keeps matching the
+  // inherited signature, which is not visible here -- confirm before parameterizing it.
+  @Override
+  public void init(Map args) {
+    super.init(args);
+    minimumLength = getInt(MINIMUM_LENGTH_PARAM, KuromojiKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH);
+    if (minimumLength < 2) {
+      throw new SolrException(SolrException.ErrorCode.UNKNOWN,
+          "Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)");
+    }
+  }
+
+  /**
+   * Wraps the given stream in a {@link KuromojiKatakanaStemFilter} that uses the configured
+   * minimum length.
+   *
+   * @param input the token stream to filter
+   * @return a new stemming filter reading from {@code input}
+   */
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new KuromojiKatakanaStemFilter(input, minimumLength);
+  }
+}
Property changes on: solr/core/src/java/org/apache/solr/analysis/KuromojiKatakanaStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: solr/example/solr/conf/schema.xml
===================================================================
--- solr/example/solr/conf/schema.xml (revision 1303738)
+++ solr/example/solr/conf/schema.xml (working copy)
@@ -720,7 +720,9 @@
-
+
+
+
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (revision 1303738)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (working copy)
@@ -24,6 +24,9 @@
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.kuromoji.KuromojiTokenizer.Mode;
+/**
+ * Test Kuromoji Japanese morphological analyzer
+ */
public class TestKuromojiAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
* stopwords file is missing in classpath */
@@ -54,27 +57,26 @@
KuromojiAnalyzer.getDefaultStopSet(),
KuromojiAnalyzer.getDefaultStopTags());
- /*
- //TokenStream ts = a.tokenStream("foo", new StringReader("妹の咲子です。俺と年子で、今受験生です。"));
- TokenStream ts = a.tokenStream("foo", new StringReader("?>-->;"));
- ts.reset();
- CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
- while(ts.incrementToken()) {
- System.out.println(" " + termAtt.toString());
- }
- System.out.println("DONE PARSE\n\n");
- */
-
// Senior software engineer:
assertAnalyzesToPositions(a, "シニアソフトウェアエンジニア",
new String[] { "シニア",
- "シニアソフトウェアエンジニア",
+ "シニアソフトウェアエンジニア", // zero pos inc
"ソフトウェア",
"エンジニア" },
new int[] { 1, 0, 1, 1},
new int[] { 1, 3, 1, 1}
);
+ // Senior project manager: also tests katakana spelling variation stemming
+ assertAnalyzesToPositions(a, "シニアプロジェクトマネージャー",
+ new String[] { "シニア",
+ "シニアプロジェクトマネージャ", // trailing ー removed by stemming, zero pos inc
+ "プロジェクト",
+ "マネージャ"}, // trailing ー removed by stemming
+ new int[]{1, 0, 1, 1},
+ new int[]{1, 3, 1, 1}
+ );
+
// Kansai International Airport:
assertAnalyzesToPositions(a, "関西国際空港",
new String[] { "関西",
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java (revision 0)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java (revision 0)
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Tests for {@link org.apache.lucene.analysis.kuromoji.KuromojiKatakanaStemFilter}
+ */
+public class TestKuromojiKatakanaStemFilter extends BaseTokenStreamTestCase {
+  /** Analyzer under test: a mock tokenizer followed by the stem filter with its default minimum length. */
+  private final Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      // Use a MockTokenizer here since this filter doesn't really depend on Kuromoji
+      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+      return new TokenStreamComponents(source, new KuromojiKatakanaStemFilter(source));
+    }
+  };
+
+  /**
+   * Test a few common katakana spelling variations.
+   * <p>
+   * English translations of the input terms are as follows:
+   * <ul>
+   *   <li>copy</li>
+   *   <li>coffee</li>
+   *   <li>taxi</li>
+   *   <li>party</li>
+   *   <li>party (without long sound)</li>
+   *   <li>center</li>
+   * </ul>
+   * <p>
+   * "copy" is expected unchanged because its three characters are below the default
+   * minimum length of four. Note that for "coffee" the trailing long sound is removed
+   * even though the correct spelling requires it.
+   */
+  public void testStemVariants() throws IOException {
+    assertAnalyzesTo(analyzer, "コピー コーヒー タクシー パーティー パーティ センター",
+      new String[] { "コピー", "コーヒ", "タクシ", "パーティ", "パーティ", "センタ" },
+      new int[] { 0, 4,  9, 14, 20, 25 },
+      new int[] { 3, 8, 13, 19, 24, 29 });
+  }
+
+  /**
+   * Half-width katakana must pass through the filter untouched.
+   */
+  public void testUnsupportedHalfWidthVariants() throws IOException {
+    // Half-width "taxi" (U+FF80 U+FF78 U+FF7C U+FF70). Written with escapes so the
+    // half-width code points cannot be silently normalized to their full-width forms,
+    // which the filter WOULD stem.
+    final String halfWidthTaxi = "\uFF80\uFF78\uFF7C\uFF70";
+    // The below result is expected since only full-width katakana is supported
+    assertAnalyzesTo(analyzer, halfWidthTaxi, new String[] { halfWidthTaxi });
+  }
+
+  /** Runs the analyzer over random input via the inherited {@code checkRandomData} helper. */
+  public void testRandomData() throws IOException {
+    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
+  }
+}
Property changes on: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiKatakanaStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (revision 1303738)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiAnalyzer.java (working copy)
@@ -91,6 +91,7 @@
stream = new KuromojiPartOfSpeechStopFilter(true, stream, stoptags);
stream = new CJKWidthFilter(stream);
stream = new StopFilter(matchVersion, stream, stopwords);
+ stream = new KuromojiKatakanaStemFilter(stream);
stream = new LowerCaseFilter(matchVersion, stream);
return new TokenStreamComponents(tokenizer, stream);
}
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java (revision 0)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java (revision 0)
@@ -0,0 +1,98 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+import java.io.IOException;
+
+/**
+ * A {@link TokenFilter} that normalizes common katakana spelling variations
+ * ending in a long sound character by removing this character (U+30FC). Only
+ * katakana words of at least a minimum length are stemmed (default is four).
+ * <p>
+ * Note that only full-width katakana characters are supported. Please use a
+ * {@link org.apache.lucene.analysis.cjk.CJKWidthFilter} to convert half-width
+ * katakana to full-width before using this filter.
+ * </p>
+ * <p>
+ * In order to prevent terms from being stemmed, use an instance of
+ * {@link org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter}
+ * or a custom {@link TokenFilter} that sets the {@link KeywordAttribute}
+ * before this {@link TokenStream}.
+ * </p>
+ */
+public final class KuromojiKatakanaStemFilter extends TokenFilter {
+  /** Minimum term length used by {@link #KuromojiKatakanaStemFilter(TokenStream)}. */
+  public static final int DEFAULT_MINIMUM_LENGTH = 4;
+
+  /** The long sound character (U+30FC) that is stripped from the end of a term. */
+  private static final char HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = '\u30fc';
+
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
+
+  /** Terms shorter than this many chars are never stemmed. */
+  private final int minimumKatakanaLength;
+
+  /**
+   * Creates a filter that stems katakana terms of at least {@code minimumLength} chars.
+   *
+   * @param input the token stream to filter
+   * @param minimumLength terms shorter than this are left unchanged
+   */
+  public KuromojiKatakanaStemFilter(TokenStream input, int minimumLength) {
+    super(input);
+    this.minimumKatakanaLength = minimumLength;
+  }
+
+  /**
+   * Creates a filter that uses {@link #DEFAULT_MINIMUM_LENGTH}.
+   *
+   * @param input the token stream to filter
+   */
+  public KuromojiKatakanaStemFilter(TokenStream input) {
+    this(input, DEFAULT_MINIMUM_LENGTH);
+  }
+
+  /**
+   * Advances the wrapped stream and, unless the token is flagged as a keyword,
+   * shortens the current term to its stemmed length.
+   *
+   * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    if (!keywordAttr.isKeyword()) {
+      termAttr.setLength(stem(termAttr.buffer(), termAttr.length()));
+    }
+    return true;
+  }
+
+  /**
+   * Computes the length the term should have after stemming.
+   *
+   * @param term buffer holding the term chars
+   * @param length number of valid chars in {@code term}
+   * @return {@code length - 1} when the term is long enough, consists solely of
+   *         katakana and ends in the long sound character; {@code length} otherwise
+   */
+  private int stem(char[] term, int length) {
+    // The explicit empty-term guard keeps term[length - 1] in bounds even when the
+    // filter was constructed with a minimum length of zero or less.
+    if (length == 0 || length < minimumKatakanaLength) {
+      return length;
+    }
+
+    // Cheap test first: without a trailing long sound character there is nothing to
+    // strip, so the full katakana scan can be skipped.
+    if (term[length - 1] != HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK) {
+      return length;
+    }
+
+    return isKatakana(term, length) ? length - 1 : length;
+  }
+
+  /**
+   * Tells whether the first {@code length} chars of {@code term} all belong to the
+   * {@link Character.UnicodeBlock#KATAKANA} block.
+   */
+  private static boolean isKatakana(char[] term, int length) {
+    for (int i = 0; i < length; i++) {
+      // NOTE: Only the KATAKANA block (full-width characters) is accepted. Half-width
+      // katakana lives in a different Unicode block and is therefore NOT supported.
+      if (Character.UnicodeBlock.of(term[i]) != Character.UnicodeBlock.KATAKANA) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
Property changes on: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiKatakanaStemFilter.java
___________________________________________________________________
Added: svn:eol-style
+ native