Index: solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java
===================================================================
--- solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java (revision 0)
+++ solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java (revision 0)
@@ -0,0 +1,59 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.kuromoji.KuromojiReadingFormFilter;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link KuromojiReadingFormFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_ja" class="solr.TextField"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.KuromojiTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.KuromojiReadingFormFilterFactory"
+ *             useRomaji="false"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ */
+public class KuromojiReadingFormFilterFactory extends BaseTokenFilterFactory {
+  /** Name of the boolean factory argument that is handed to the filter as its useRomaji flag. */
+  private static final String ROMAJI_PARAM = "useRomaji";
+  private boolean useRomaji;
+
+  /**
+   * Initializes the superclass and reads {@code useRomaji} through {@code getBoolean}, passing {@code false} as the default.
+   *
+   * @param args factory arguments, handed on unchanged to the superclass
+   */
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    useRomaji = getBoolean(ROMAJI_PARAM, false);
+  }
+
+  /**
+   * Wraps {@code input} in a new {@link KuromojiReadingFormFilter} configured with the stored useRomaji flag.
+   */
+  public TokenStream create(TokenStream input) {
+    return new KuromojiReadingFormFilter(input, useRomaji);
+  }
+}

Property changes on: solr/core/src/java/org/apache/solr/analysis/KuromojiReadingFormFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java (revision 0)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java (revision 0)
@@ -0,0 +1,64 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+
+/**
+ * Tests for {@link KuromojiReadingFormFilter} with katakana and with romaji output.
+ */
+public class TestKuromojiReadingFormFilter extends BaseTokenStreamTestCase {
+  // One analyzer per value of the filter's useRomaji flag.
+  private final Analyzer katakanaAnalyzer = readingAnalyzer(false);
+  private final Analyzer romajiAnalyzer = readingAnalyzer(true);
+
+  /** Chains a SEARCH-mode {@link KuromojiTokenizer} into a {@link KuromojiReadingFormFilter}. */
+  private static Analyzer readingAnalyzer(final boolean useRomaji) {
+    return new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer source = new KuromojiTokenizer(reader, null, true, KuromojiTokenizer.Mode.SEARCH);
+        return new TokenStreamComponents(source, new KuromojiReadingFormFilter(source, useRomaji));
+      }
+    };
+  }
+
+  /** With useRomaji off, the emitted terms are the expected katakana readings. */
+  public void testKatakanaReadings() throws IOException {
+    final String[] expected = { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" };
+    assertAnalyzesTo(katakanaAnalyzer, "今夜はロバート先生と話した", expected);
+  }
+
+  /** With useRomaji on, the emitted terms are the expected romanized readings. */
+  public void testRomajiReadings() throws IOException {
+    final String[] expected = { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" };
+    assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した", expected);
+  }
+
+  /** Passes the katakana analyzer, then the romaji analyzer, to checkRandomData. */
+  public void testRandomData() throws IOException {
+    for (Analyzer analyzer : new Analyzer[] { katakanaAnalyzer, romajiAnalyzer }) {
+      checkRandomData(random, analyzer, 1000*RANDOM_MULTIPLIER);
+    }
+  }
+}

Property changes on: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiReadingFormFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java (revision 0)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java (revision 0)
@@ -0,0 +1,85 @@
+package org.apache.lucene.analysis.kuromoji;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenattributes.ReadingAttribute;
+import org.apache.lucene.analysis.kuromoji.util.ToStringUtil;
+
+import java.io.IOException;
+
+/**
+ * A {@link org.apache.lucene.analysis.TokenFilter} that replaces the term
+ * attribute with the reading of a token in either katakana or romaji form.
+ * The default reading form is katakana. Tokens whose reading is {@code null}
+ * are passed through with their term text unchanged.
+ */
+public final class KuromojiReadingFormFilter extends TokenFilter {
+  private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
+  private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
+
+  /** When true, the reading is romanized before it replaces the term text. */
+  private final boolean useRomaji;
+
+  /**
+   * Creates a filter with an explicit reading form.
+   *
+   * @param input the token stream to read tokens from
+   * @param useRomaji {@code true} to emit romanized readings, {@code false} to emit
+   *        the reading exactly as returned by {@link ReadingAttribute#getReading()}
+   */
+  public KuromojiReadingFormFilter(TokenStream input, boolean useRomaji) {
+    super(input);
+    this.useRomaji = useRomaji;
+  }
+
+  /**
+   * Creates a filter that does not romanize readings.
+   *
+   * @param input the token stream to read tokens from
+   */
+  public KuromojiReadingFormFilter(TokenStream input) {
+    this(input, false);
+  }
+
+  /**
+   * Advances the wrapped stream and, when the current token has a reading, overwrites
+   * the term text with that reading (romanized first if {@code useRomaji} is set).
+   *
+   * @return {@code false} when the wrapped stream's incrementToken() returns false, {@code true} otherwise
+   * @throws IOException propagated from the wrapped stream
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+    final String reading = readingAttr.getReading();
+    // Tokens without a reading keep the term text produced upstream.
+    if (reading != null) {
+      if (useRomaji) {
+        termAttr.setEmpty().append(ToStringUtil.getRomanization(reading));
+      } else {
+        termAttr.setEmpty().append(reading);
+      }
+    }
+    return true;
+  }
+}

Property changes on: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiReadingFormFilter.java
___________________________________________________________________
Added: svn:eol-style
   + native