Index: . =================================================================== --- . (revision 1365483) +++ . (working copy) Property changes on: . ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510:r1364862-1365496 Index: dev-tools =================================================================== --- dev-tools (revision 1365483) +++ dev-tools (working copy) Property changes on: dev-tools ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/dev-tools:r1364862-1365496 Index: dev-tools/eclipse/dot.classpath =================================================================== --- dev-tools/eclipse/dot.classpath (revision 1365483) +++ dev-tools/eclipse/dot.classpath (working copy) @@ -17,25 +17,28 @@ - + - + - + + - + - + + + Index: lucene =================================================================== --- lucene (revision 1365483) +++ lucene (working copy) Property changes on: lucene ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene:r1364862-1365496 Index: lucene/analysis =================================================================== --- lucene/analysis (revision 1365483) +++ lucene/analysis (working copy) Property changes on: lucene/analysis ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/analysis:r1364862-1365496 Index: lucene/analysis/common =================================================================== --- lucene/analysis/common (revision 1365483) +++ lucene/analysis/common (working copy) Property changes on: lucene/analysis/common ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/analysis/common:r1364862-1365496 Index: 
lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java (working copy) @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.ar; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + + +/** + * Factory for {@link ArabicNormalizationFilter}. + *
+ * <fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.ArabicNormalizationFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * <p>The factory is multi-term aware: {@link #getMultiTermComponent()} returns this same instance.
+ */
+public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+  public ArabicNormalizationFilter create(TokenStream input) {
+    return new ArabicNormalizationFilter(input);
+  }
+
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizationFilterFactory.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java (working copy)
@@ -0,0 +1,43 @@
+package org.apache.lucene.analysis.ar;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ar.ArabicStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + + +/** + * Factory for {@link ArabicStemFilter}. + *
+ * <fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.ArabicNormalizationFilterFactory"/>
+ *     <filter class="solr.ArabicStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * <p>{@link #create(TokenStream)} returns the input wrapped in a new {@link ArabicStemFilter}.
+ */
+public class ArabicStemFilterFactory extends TokenFilterFactory {
+
+
+  public ArabicStemFilter create(TokenStream input) {
+    return new ArabicStemFilter(input);
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicStemFilterFactory.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java (working copy)
@@ -0,0 +1,40 @@
+package org.apache.lucene.analysis.bg;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.bg.BulgarianStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link BulgarianStemFilter}. + *
+ * <fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.BulgarianStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * <p>{@link #create(TokenStream)} returns the input wrapped in a new {@link BulgarianStemFilter}.
+ */
+public class BulgarianStemFilterFactory extends TokenFilterFactory {
+  public TokenStream create(TokenStream input) {
+    return new BulgarianStemFilter(input);
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianStemFilterFactory.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java (working copy)
@@ -0,0 +1,41 @@
+package org.apache.lucene.analysis.br;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.br.BrazilianStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link BrazilianStemFilter}. + *
+ * <fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.BrazilianStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * <p>{@link #create(TokenStream)} returns the input wrapped in a new {@link BrazilianStemFilter}.
+ */
+public class BrazilianStemFilterFactory extends TokenFilterFactory {
+  public BrazilianStemFilter create(TokenStream in) {
+    return new BrazilianStemFilter(in);
+  }
+}
+
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianStemFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java (working copy)
@@ -0,0 +1,70 @@
+package org.apache.lucene.analysis.charfilter;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; +import org.apache.lucene.analysis.util.CharFilterFactory; + +import java.io.Reader; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** +* Factory for {@link HTMLStripCharFilter}. + *
+ * <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" />
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * <p>Optional <code>escapedTags</code> argument: tag names separated by commas and/or whitespace, handed to the two-argument {@link HTMLStripCharFilter} constructor.
+ */
+ public class HTMLStripCharFilterFactory extends CharFilterFactory {
+
+  Set<String> escapedTags = null; // stays null until init() finds at least one tag name
+  static final Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
+
+  public HTMLStripCharFilter create(Reader input) {
+    HTMLStripCharFilter charFilter;
+    if (null == escapedTags) {
+      charFilter = new HTMLStripCharFilter(input);
+    } else {
+      charFilter = new HTMLStripCharFilter(input, escapedTags);
+    }
+    return charFilter;
+  }
+
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    String escapedTagsArg = args.get("escapedTags");
+    if (null != escapedTagsArg) {
+      Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg);
+      while (matcher.find()) {
+        if (null == escapedTags) {
+          escapedTags = new HashSet<String>();
+        }
+        escapedTags.add(matcher.group(0));
+      }
+    }
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterFactory.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java (working copy)
@@ -0,0 +1,140 @@
+package
org.apache.lucene.analysis.charfilter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.charfilter.MappingCharFilter; +import org.apache.lucene.analysis.charfilter.NormalizeCharMap; +import org.apache.lucene.analysis.util.*; + +/** + * Factory for {@link MappingCharFilter}. + *
+ * <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ *
+ * <p>Each rule line read from the <code>mapping</code> file(s) must look like <code>"source" =&gt; "target"</code>; backslash escapes inside the quotes are decoded by {@link #parseString(String)}.
+ * @since Solr 1.4
+ *
+ */
+public class MappingCharFilterFactory extends CharFilterFactory implements
+    ResourceLoaderAware, MultiTermAwareComponent {
+
+  protected NormalizeCharMap normMap;
+  private String mapping;
+
+  public void inform(ResourceLoader loader) {
+    mapping = args.get( "mapping" );
+
+    if( mapping != null ){
+      List<String> wlist = null;
+      try{
+        File mappingFile = new File( mapping );
+        if( mappingFile.exists() ){
+          wlist = loader.getLines( mapping );
+        }
+        else{
+          List<String> files = splitFileNames( mapping );
+          wlist = new ArrayList<String>();
+          for( String file : files ){
+            List<String> lines = loader.getLines( file.trim() );
+            wlist.addAll( lines );
+          }
+        }
+      }
+      catch( IOException e ){
+        throw new InitializationException("IOException thrown while loading mappings", e);
+      }
+      final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+      parseRules( wlist, builder );
+      normMap = builder.build();
+      if (normMap.map == null) {
+        // if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
+        // so just set the whole map to null
+        normMap = null;
+      }
+    }
+  }
+
+  public Reader create(Reader input) {
+    // if the map is null, it means there's actually no mappings... just return the original stream
+    // as there is nothing to do here.
+    return normMap == null ? input : new MappingCharFilter(normMap,input);
+  }
+
+  // "source" => "target"
+  static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
+
+  protected void parseRules( List<String> rules, NormalizeCharMap.Builder builder ){
+    for( String rule : rules ){
+      Matcher m = p.matcher( rule );
+      if( !m.find() )
+        throw new InitializationException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
+      builder.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
+    }
+  }
+
+  char[] out = new char[256]; // no longer used by parseString; kept so the package-visible field does not disappear
+  // Decodes backslash escapes in s into a buffer allocated per call, so the result is not capped at 256 chars.
+  protected String parseString( String s ){
+    int readPos = 0;
+    int len = s.length();
+    StringBuilder sb = new StringBuilder( len );
+    while( readPos < len ){
+      char c = s.charAt( readPos++ );
+      if( c == '\\' ){
+        if( readPos >= len )
+          throw new InitializationException("Invalid escaped char in [" + s + "]");
+        c = s.charAt( readPos++ );
+        switch( c ) {
+          case '\\' : c = '\\'; break;
+          case '"' : c = '"'; break;
+          case 'n' : c = '\n'; break;
+          case 't' : c = '\t'; break;
+          case 'r' : c = '\r'; break;
+          case 'b' : c = '\b'; break;
+          case 'f' : c = '\f'; break;
+          case 'u' :
+            if( readPos + 3 >= len )
+              throw new InitializationException("Invalid escaped char in [" + s + "]");
+            c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
+            readPos += 4;
+            break;
+        }
+      }
+      sb.append( c );
+    }
+    return sb.toString();
+  }
+
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilterFactory.java
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java (working copy) @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis.cjk; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cjk.CJKBigramFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link CJKBigramFilter}. + *
+ * <fieldType name="text_cjk" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.CJKWidthFilterFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.CJKBigramFilterFactory" 
+ *       han="true" hiragana="true" 
+ *       katakana="true" hangul="true" />
+ *   </analyzer>
+ * </fieldType>
+ */
+public class CJKBigramFilterFactory extends TokenFilterFactory {
+  int flags; // bitwise OR of the CJKBigramFilter script flags selected in init()
+  // han, hiragana, katakana and hangul each default to true; a true value sets the matching flag.
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    flags = 0;
+    if (getBoolean("han", true)) {
+      flags |= CJKBigramFilter.HAN;
+    }
+    if (getBoolean("hiragana", true)) {
+      flags |= CJKBigramFilter.HIRAGANA;
+    }
+    if (getBoolean("katakana", true)) {
+      flags |= CJKBigramFilter.KATAKANA;
+    }
+    if (getBoolean("hangul", true)) {
+      flags |= CJKBigramFilter.HANGUL;
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new CJKBigramFilter(input, flags);
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java (working copy)
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.cjk;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cjk.CJKWidthFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link CJKWidthFilter}. + *
+ * <fieldType name="text_cjk" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.CJKWidthFilterFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.CJKBigramFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */
+
+public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new CJKWidthFilter(input);
+  }
+  // Multi-term aware: hands back this same factory instance.
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+}
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKWidthFilterFactory.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java (revision 0)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java (working copy)
@@ -0,0 +1,83 @@
+package org.apache.lucene.analysis.commongrams;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.util.*; + +/** + * Constructs a {@link CommonGramsFilter}. + *
+ * <fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/>
+ *   </analyzer>
+ * </fieldType>
+ * <p>Common words come from the file(s) named by <code>words</code> (read as Snowball word lists when <code>format="snowball"</code>); without <code>words</code>, <code>StopAnalyzer.ENGLISH_STOP_WORDS_SET</code> is used.
+ */
+
+/*
+ * This is pretty close to a straight copy from StopFilterFactory
+ */
+public class CommonGramsFilterFactory extends TokenFilterFactory implements
+    ResourceLoaderAware {
+
+  public void inform(ResourceLoader loader) {
+    String commonWordFiles = args.get("words");
+    ignoreCase = getBoolean("ignoreCase", false);
+
+    if (commonWordFiles != null) {
+      try {
+        if ("snowball".equalsIgnoreCase(args.get("format"))) {
+          commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
+        } else {
+          commonWords = getWordSet(loader, commonWordFiles, ignoreCase);
+        }
+      } catch (IOException e) {
+        throw new InitializationException("IOException thrown while loading common word file", e);
+      }
+    } else {
+      commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+    }
+  }
+
+  //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
+  private CharArraySet commonWords;
+  private boolean ignoreCase;
+
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  public CharArraySet getCommonWords() {
+    return commonWords;
+  }
+
+  public CommonGramsFilter create(TokenStream input) {
+    CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
+    return commonGrams;
+  }
+}
+
+
+
\ No newline at end of file
Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
===================================================================
--- lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java (revision 1365496)
+++ lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java (working copy)
Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsFilterFactory.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+HeadURL
\
No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java (working copy) @@ -0,0 +1,95 @@ +package org.apache.lucene.analysis.commongrams; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.StopFilterFactory; +import org.apache.lucene.analysis.util.*; + +/** + * Construct {@link CommonGramsQueryFilter}. + * + * This is pretty close to a straight copy from {@link StopFilterFactory}. + * + *
+ * <fieldType name="text_cmmngrmsqry" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.CommonGramsQueryFilterFactory" words="commongramsquerystopwords.txt" ignoreCase="false"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class CommonGramsQueryFilterFactory extends TokenFilterFactory + implements ResourceLoaderAware { + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public void inform(ResourceLoader loader) { + String commonWordFiles = args.get("words"); + ignoreCase = getBoolean("ignoreCase", false); + + if (commonWordFiles != null) { + try { + if ("snowball".equalsIgnoreCase(args.get("format"))) { + commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase); + } else { + commonWords = getWordSet(loader, commonWordFiles, ignoreCase); + } + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading common word file", e); + } + } else { + commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; + } + } + + // Force the use of a char array set, as it is the most performant, although + // this may break things if Lucene ever goes away from it. See SOLR-1095 + private CharArraySet commonWords; + + private boolean ignoreCase; + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public CharArraySet getCommonWords() { + return commonWords; + } + + /** + * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter + */ + public CommonGramsQueryFilter create(TokenStream input) { + CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords); + CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter( + commonGrams); + return commonGramsQuery; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java (working copy) Property changes on: 
lucene/analysis/common/src/java/org/apache/lucene/analysis/commongrams/CommonGramsQueryFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (revision 1365483) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (working copy) @@ -57,6 +57,9 @@ */ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary) { super(matchVersion, input, dictionary); + if (dictionary == null) { + throw new IllegalArgumentException("dictionary cannot be null"); + } } /** @@ -83,6 +86,9 @@ public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, CharArraySet dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + if (dictionary == null) { + throw new IllegalArgumentException("dictionary cannot be null"); + } } @Override Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java (working copy) @@ -0,0 +1,71 @@ +package org.apache.lucene.analysis.compound; + +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.*; +import org.apache.lucene.analysis.TokenStream; + +import java.util.Map; +import java.io.IOException; + +/** + * Factory for {@link DictionaryCompoundWordTokenFilter}. + *
+ * <fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
+ *     	     minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private CharArraySet dictionary; + private String dictFile; + private int minWordSize; + private int minSubwordSize; + private int maxSubwordSize; + private boolean onlyLongestMatch; + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + dictFile = args.get("dictionary"); + if (null == dictFile) { + throw new InitializationException("Missing required parameter: dictionary"); + } + + minWordSize= getInt("minWordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); + minSubwordSize= getInt("minSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); + maxSubwordSize= getInt("maxSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); + onlyLongestMatch = getBoolean("onlyLongestMatch",true); + } + public void inform(ResourceLoader loader) { + try { + dictionary = super.getWordSet(loader, dictFile, false); + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading dictionary", e); + } + } + public TokenStream create(TokenStream input) { + // if the dictionary is null, it means it was empty + return dictionary == null ? 
input : new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java (working copy) @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.compound; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; +import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; +import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.IOUtils; + +import java.util.Map; +import java.io.InputStream; +import org.xml.sax.InputSource; + +/** + * Factory for {@link HyphenationCompoundWordTokenFilter}. + *

+ * This factory accepts the following parameters: + *

    + *
  • hyphenator (mandatory): path to the FOP xml hyphenation pattern. + * See http://offo.sourceforge.net/hyphenation/. + *
  • encoding (optional): encoding of the xml hyphenation file. if omitted, the XML parser detects the encoding. + *
  • dictionary (optional): dictionary of words. defaults to no dictionary. + *
  • minWordSize (optional): minimal word length that gets decomposed. defaults to 5. + *
  • minSubwordSize (optional): minimum length of subwords. defaults to 2. + *
  • maxSubwordSize (optional): maximum length of subwords. defaults to 15. + *
  • onlyLongestMatch (optional): if true, adds only the longest matching subword + * to the stream. defaults to false. + *
+ *

+ *

+ * <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
+ *     	     dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @see HyphenationCompoundWordTokenFilter + */ +public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private CharArraySet dictionary; + private HyphenationTree hyphenator; + private String dictFile; + private String hypFile; + private String encoding; + private int minWordSize; + private int minSubwordSize; + private int maxSubwordSize; + private boolean onlyLongestMatch; + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + dictFile = args.get("dictionary"); + if (args.containsKey("encoding")) + encoding = args.get("encoding"); + hypFile = args.get("hyphenator"); + if (null == hypFile) { + throw new InitializationException("Missing required parameter: hyphenator"); + } + + minWordSize = getInt("minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); + minSubwordSize = getInt("minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); + maxSubwordSize = getInt("maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); + onlyLongestMatch = getBoolean("onlyLongestMatch", false); + } + + public void inform(ResourceLoader loader) { + InputStream stream = null; + try { + if (dictFile != null) // the dictionary can be empty. 
+ dictionary = getWordSet(loader, dictFile, false); + // TODO: Broken, because we cannot resolve real system id + // ResourceLoader should also supply method like ClassLoader to get resource URL + stream = loader.openResource(hypFile); + final InputSource is = new InputSource(stream); + is.setEncoding(encoding); // if it's null let xml parser decide + is.setSystemId(hypFile); + hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); + } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception" + throw new InitializationException("Exception thrown while loading dictionary and hyphenation file", e); + } finally { + IOUtils.closeWhileHandlingException(stream); + } + } + + public HyphenationCompoundWordTokenFilter create(TokenStream input) { + return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java =================================================================== --- 
lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; + +/** + * Factory for {@link KeywordTokenizer}. + *
+ * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.KeywordTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class KeywordTokenizerFactory extends TokenizerFactory { + public KeywordTokenizer create(Reader input) { + return new KeywordTokenizer(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link LetterTokenizer}. + *
+ * <fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.LetterTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class LetterTokenizerFactory extends TokenizerFactory { + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public LetterTokenizer create(Reader input) { + return new LetterTokenizer(luceneMatchVersion, input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java (working copy) @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link LowerCaseFilter}. + *
+ * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public LowerCaseFilter create(TokenStream input) { + return new LowerCaseFilter(luceneMatchVersion,input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java (working copy) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.core.LowerCaseTokenizer; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link LowerCaseTokenizer}. + *
+ * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.LowerCaseTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent { + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public LowerCaseTokenizer create(Reader input) { + return new LowerCaseTokenizer(luceneMatchVersion,input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + LowerCaseFilterFactory filt = new LowerCaseFilterFactory(); + filt.setLuceneMatchVersion(luceneMatchVersion); + filt.init(args); + return filt; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (working copy) @@ -0,0 +1,91 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.*; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.StopFilter; + +import java.util.Map; +import java.io.IOException; + +/** + * Factory for {@link StopFilter}. + *
+ * <fieldType name="text_stop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.StopFilterFactory" ignoreCase="true"
+ *             words="stopwords.txt" enablePositionIncrements="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + @Override + public void inform(ResourceLoader loader) { + String stopWordFiles = args.get("words"); + ignoreCase = getBoolean("ignoreCase",false); + enablePositionIncrements = getBoolean("enablePositionIncrements",false); + + if (stopWordFiles != null) { + try { + if ("snowball".equalsIgnoreCase(args.get("format"))) { + stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase); + } else { + stopWords = getWordSet(loader, stopWordFiles, ignoreCase); + } + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading stopwords", e); + } + } else { + stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); + } + } + + private CharArraySet stopWords; + private boolean ignoreCase; + private boolean enablePositionIncrements; + + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public CharArraySet getStopWords() { + return stopWords; + } + + @Override + public TokenStream create(TokenStream input) { + StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords); + stopFilter.setEnablePositionIncrements(enablePositionIncrements); + return stopFilter; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopFilterFactory.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (working copy) @@ -0,0 +1,84 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.TypeTokenFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Factory class for {@link TypeTokenFilter}. + *
+ * <fieldType name="chars" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
+ *                   enablePositionIncrements="true" useWhitelist="false"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class TypeTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + + @Override + public void inform(ResourceLoader loader) { + String stopTypesFiles = args.get("types"); + enablePositionIncrements = getBoolean("enablePositionIncrements", false); + useWhitelist = getBoolean("useWhitelist", false); + if (stopTypesFiles != null) { + try { + List files = splitFileNames(stopTypesFiles); + if (files.size() > 0) { + stopTypes = new HashSet(); + for (String file : files) { + List typesLines = loader.getLines(file.trim()); + stopTypes.addAll(typesLines); + } + } + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading types", e); + } + } else { + throw new InitializationException("Missing required parameter: types."); + } + } + + private boolean useWhitelist; + private Set stopTypes; + private boolean enablePositionIncrements; + + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; + } + + public Set getStopTypes() { + return stopTypes; + } + + @Override + public TokenStream create(TokenStream input) { + return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/TypeTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: 
lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link WhitespaceTokenizer}. + *
+ * <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class WhitespaceTokenizerFactory extends TokenizerFactory { + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public WhitespaceTokenizer create(Reader input) { + return new WhitespaceTokenizer(luceneMatchVersion,input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.cz; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cz.CzechStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link CzechStemFilter}. + *
+ * <fieldType name="text_czstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.CzechStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class CzechStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new CzechStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.de.GermanLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GermanLightStemFilter}. + *
+ * <fieldType name="text_delgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.GermanLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class GermanLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new GermanLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.de.GermanMinimalStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GermanMinimalStemFilter}. + *
+ * <fieldType name="text_deminstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.GermanMinimalStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class GermanMinimalStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new GermanMinimalStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.de.GermanNormalizationFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GermanNormalizationFilter}. + *
+ * <fieldType name="text_denorm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.GermanNormalizationFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class GermanNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + + public TokenStream create(TokenStream input) { + return new GermanNormalizationFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanNormalizationFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.de.GermanStemFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GermanStemFilter}. + *
+ * <fieldType name="text_destem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.GermanStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class GermanStemFilterFactory extends TokenFilterFactory { + public GermanStemFilter create(TokenStream in) { + return new GermanStemFilter(in); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java (working copy) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.el; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.el.GreekLowerCaseFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GreekLowerCaseFilter}. + *
+ * <fieldType name="text_glc" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.GreekLowerCaseFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + if (args.containsKey("charset")) + throw new InitializationException( + "The charset parameter is no longer supported. " + + "Please process your documents as Unicode instead."); + } + + public GreekLowerCaseFilter create(TokenStream in) { + return new GreekLowerCaseFilter(luceneMatchVersion, in); + } + + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.el; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.el.GreekStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GreekStemFilter}. + *
+ * <fieldType name="text_gstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.GreekLowerCaseFilterFactory"/>
+ *     <filter class="solr.GreekStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class GreekStemFilterFactory extends TokenFilterFactory { + + public TokenStream create(TokenStream input) { + return new GreekStemFilter(input); + } + +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link EnglishMinimalStemFilter}. + *
+ * <fieldType name="text_enminstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.EnglishMinimalStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class EnglishMinimalStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new EnglishMinimalStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java (working copy) @@ -0,0 +1,49 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishPossessiveFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link EnglishPossessiveFilter}. + *
+ * <fieldType name="text_enpossessive" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.EnglishPossessiveFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class EnglishPossessiveFilterFactory extends TokenFilterFactory { + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public TokenStream create(TokenStream input) { + return new EnglishPossessiveFilter(luceneMatchVersion, input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishPossessiveFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java (working copy) @@ -0,0 +1,33 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.KStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link KStemFilter} + */ +public class KStemFilterFactory extends TokenFilterFactory { + + public TokenFilter create(TokenStream input) { + return new KStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/KStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PorterStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_porterstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.PorterStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class PorterStemFilterFactory extends TokenFilterFactory { + public PorterStemFilter create(TokenStream input) { + return new PorterStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/en/PorterStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.es; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.es.SpanishLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link SpanishLightStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SpanishLightStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class SpanishLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new SpanishLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java (working copy) @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.fa; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.fa.PersianCharFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; + +/** + * Factory for {@link PersianCharFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;charFilter class="solr.PersianCharFilterFactory"/&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class PersianCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent { + + @Override + public CharFilter create(Reader input) { + return new PersianCharFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java (working copy) @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.fa; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.fa.PersianNormalizationFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PersianNormalizationFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_fanormal" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;charFilter class="solr.PersianCharFilterFactory"/&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.PersianNormalizationFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class PersianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + public PersianNormalizationFilter create(TokenStream input) { + return new PersianNormalizationFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianNormalizationFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.fi; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fi.FinnishLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link FinnishLightStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_filgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.FinnishLightStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class FinnishLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new FinnishLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.fr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.*; + +import java.io.IOException; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link ElisionFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_elsn" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.ElisionFilterFactory"
+ *       articles="stopwordarticles.txt" ignoreCase="true"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + + private CharArraySet articles; + + public void inform(ResourceLoader loader) { + String articlesFile = args.get("articles"); + boolean ignoreCase = getBoolean("ignoreCase", false); + + if (articlesFile != null) { + try { + articles = getWordSet(loader, articlesFile, ignoreCase); + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading articles", e); + } + } + } + + public ElisionFilter create(TokenStream input) { + assureMatchVersion(); + return articles == null ? new ElisionFilter(luceneMatchVersion,input) : + new ElisionFilter(luceneMatchVersion,input,articles); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/ElisionFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.fr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fr.FrenchLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link FrenchLightStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_frlgtstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.ElisionFilterFactory"/&gt;
+ *     &lt;filter class="solr.FrenchLightStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class FrenchLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new FrenchLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.fr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link FrenchMinimalStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_frminstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.ElisionFilterFactory"/&gt;
+ *     &lt;filter class="solr.FrenchMinimalStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class FrenchMinimalStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new FrenchMinimalStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java (working copy) @@ -0,0 +1,49 @@ +package org.apache.lucene.analysis.ga; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ga.IrishLowerCaseFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link IrishLowerCaseFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.IrishLowerCaseFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class IrishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + + @Override + public TokenStream create(TokenStream input) { + return new IrishLowerCaseFilter(input); + } + + // this will 'mostly work', except for special cases, just like most other filters + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishLowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.gl; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.gl.GalicianMinimalStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GalicianMinimalStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_glplural" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.GalicianMinimalStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class GalicianMinimalStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new GalicianMinimalStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.gl; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.gl.GalicianStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link GalicianStemFilter}. + *
+ * <pre class="prettyprint">&lt;fieldType name="text_glstem" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.GalicianStemFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ * + */ +public class GalicianStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new GalicianStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.hi; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hi.HindiNormalizationFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link HindiNormalizationFilter}. + *
+ * <fieldType name="text_hinormal" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.HindiNormalizationFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class HindiNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + public TokenStream create(TokenStream input) { + return new HindiNormalizationFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizationFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.hi; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hi.HindiStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link HindiStemFilter}. + *
+ * <fieldType name="text_histem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.HindiStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class HindiStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new HindiStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.hu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hu.HungarianLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link HungarianLightStemFilter}. + *
+ * <fieldType name="text_hulgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.HungarianLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class HungarianLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new HungarianLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (working copy) @@ -0,0 +1,121 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hunspell.HunspellDictionary; +import org.apache.lucene.analysis.hunspell.HunspellStemFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.util.IOUtils; + +/** + * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}. + * Example config for British English including a custom dictionary, case insensitive matching: + *
+ * <filter class="solr.HunspellStemFilterFactory"
+ *    dictionary="en_GB.dic,my_custom.dic"
+ *    affix="en_GB.aff"
+ *    ignoreCase="true" />
+ * Both parameters dictionary and affix are mandatory. + *
+ * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false. + *
+ * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true. + * If strict, an error while reading an affix rule causes a ParseException; otherwise the error is ignored. + *
+ * Dictionaries for many languages are available through the OpenOffice project. + * + * See http://wiki.apache.org/solr/Hunspell + */ +public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + + private static final String PARAM_DICTIONARY = "dictionary"; + private static final String PARAM_AFFIX = "affix"; + private static final String PARAM_IGNORE_CASE = "ignoreCase"; + private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; + private static final String TRUE = "true"; + private static final String FALSE = "false"; + + private HunspellDictionary dictionary; + private boolean ignoreCase = false; + + /** + * Loads the hunspell dictionary and affix files defined in the configuration + * + * @param loader ResourceLoader used to load the files + */ + public void inform(ResourceLoader loader) { + assureMatchVersion(); + String dictionaryArg = args.get(PARAM_DICTIONARY); + if (dictionaryArg == null) { + throw new InitializationException("Parameter " + PARAM_DICTIONARY + " is mandatory."); + } + String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(","); + String affixFile = args.get(PARAM_AFFIX); + String pic = args.get(PARAM_IGNORE_CASE); + if(pic != null) { + if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true; + else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false; + else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false"); + } + + String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING); + boolean strictAffixParsing = true; + if(strictAffixParsingParam != null) { + if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false; + else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true; + else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". 
Must be true or false"); + } + + InputStream affix = null; + List dictionaries = new ArrayList(); + + try { + dictionaries = new ArrayList(); + for (String file : dictionaryFiles) { + dictionaries.add(loader.openResource(file)); + } + affix = loader.openResource(affixFile); + + this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing); + } catch (Exception e) { + throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e); + } finally { + IOUtils.closeWhileHandlingException(affix); + IOUtils.closeWhileHandlingException(dictionaries); + } + } + + /** + * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given + * TokenStream + * + * @param tokenStream TokenStream that will be filtered + * @return HunspellStemFilter that filters the TokenStream + */ + public TokenStream create(TokenStream tokenStream) { + return new HunspellStemFilter(tokenStream, dictionary); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java =================================================================== --- 
lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java (working copy) @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.id; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.id.IndonesianStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link IndonesianStemFilter}. + *
+ * <fieldType name="text_idstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class IndonesianStemFilterFactory extends TokenFilterFactory { + private boolean stemDerivational = true; + + @Override + public void init(Map args) { + super.init(args); + stemDerivational = getBoolean("stemDerivational", true); + } + + public TokenStream create(TokenStream input) { + return new IndonesianStemFilter(input, stemDerivational); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.in; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.in.IndicNormalizationFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link IndicNormalizationFilter}. + *
+ * <fieldType name="text_innormal" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.IndicNormalizationFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class IndicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + public TokenStream create(TokenStream input) { + return new IndicNormalizationFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/in/IndicNormalizationFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.it; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.it.ItalianLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link ItalianLightStemFilter}. + *
+ * <fieldType name="text_itlgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.ItalianLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class ItalianLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new ItalianLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.lv; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.lv.LatvianStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link LatvianStemFilter}. + *
+ * <fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.LatvianStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class LatvianStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new LatvianStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link ASCIIFoldingFilter}. + *
+ * <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ASCIIFoldingFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + public ASCIIFoldingFilter create(TokenStream input) { + return new ASCIIFoldingFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java (working copy) @@ -0,0 +1,140 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Map; +import java.util.StringTokenizer; + +/** + * Factory for {@link CapitalizationFilter}. + *

+ * The factory takes parameters:
+ * "onlyFirstWord" - should only the first word be capitalized, or all of the words?
+ * "keep" - a keep word list. Each word that should be kept, separated by whitespace.
+ * "keepIgnoreCase" - true or false. If true, the keep list will be considered case-insensitive.
+ * "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list
+ * "okPrefix" - do not change word capitalization if a word begins with something in this list. + * For example, if "McK" is on the okPrefix list, the word "McKinley" should not be changed to + * "Mckinley".
+ * "minWordLength" - how long the word needs to be to get capitalization applied. If the + * minWordLength is 3, "and" becomes "And" but "or" stays "or".
+ * "maxWordCount" - if the token contains more than maxWordCount words, the capitalization is + * assumed to be correct.
+ * + *

+ * <fieldType name="text_cptlztn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.CapitalizationFilterFactory" onlyFirstWord="true"
+ *     	     keep="java solr lucene" keepIgnoreCase="false"
+ *     	     okPrefix="McK McD McA"/>   
+ *   </analyzer>
+ * </fieldType>
 + * + * + * @since solr 1.3 + */ +public class CapitalizationFilterFactory extends TokenFilterFactory { + public static final String KEEP = "keep"; + public static final String KEEP_IGNORE_CASE = "keepIgnoreCase"; + public static final String OK_PREFIX = "okPrefix"; + public static final String MIN_WORD_LENGTH = "minWordLength"; + public static final String MAX_WORD_COUNT = "maxWordCount"; + public static final String MAX_TOKEN_LENGTH = "maxTokenLength"; + public static final String ONLY_FIRST_WORD = "onlyFirstWord"; + public static final String FORCE_FIRST_LETTER = "forceFirstLetter"; + + // the "keep" list; assigned only in init() and left null when the "keep" argument is absent + CharArraySet keep; + + Collection okPrefix = Collections.emptyList(); // for example: McK + + int minWordLength = 0; // don't modify capitalization for words shorter than this + int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT; + int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH; + boolean onlyFirstWord = true; + boolean forceFirstLetter = true; // make sure the first letter is capital even if it is in the keep list + + @Override + public void init(Map args) { /* every argument read here is optional; an absent argument leaves the field's default in place */ + super.init(args); + assureMatchVersion(); + + String k = args.get(KEEP); + if (k != null) { + StringTokenizer st = new StringTokenizer(k); + boolean ignoreCase = false; + String ignoreStr = args.get(KEEP_IGNORE_CASE); + if ("true".equalsIgnoreCase(ignoreStr)) { + ignoreCase = true; + } + keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase); + while (st.hasMoreTokens()) { + k = st.nextToken().trim(); + keep.add(k.toCharArray()); + } + } + + k = args.get(OK_PREFIX); + if (k != null) { + okPrefix = new ArrayList(); + StringTokenizer st = new StringTokenizer(k); + while (st.hasMoreTokens()) { + okPrefix.add(st.nextToken().trim().toCharArray()); + } + } + + k = args.get(MIN_WORD_LENGTH); + if (k != null) { + minWordLength = Integer.valueOf(k); + } + + k = args.get(MAX_WORD_COUNT); + if (k != null) { + 
maxWordCount = Integer.valueOf(k); + } + + k = args.get(MAX_TOKEN_LENGTH); + if (k != null) { + maxTokenLength = Integer.valueOf(k); + } + + k = args.get(ONLY_FIRST_WORD); + if (k != null) { + onlyFirstWord = Boolean.valueOf(k); + } + + k = args.get(FORCE_FIRST_LETTER); + if (k != null) { + forceFirstLetter = Boolean.valueOf(k); + } + } + + public CapitalizationFilter create(TokenStream input) { + return new CapitalizationFilter(input, onlyFirstWord, keep, + forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CapitalizationFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link HyphenatedWordsFilter}. + *
+ * <fieldType name="text_hyphn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.HyphenatedWordsFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
 + * Reads no configuration arguments of its own; create() wraps the input stream in a new HyphenatedWordsFilter. + */ +public class HyphenatedWordsFilterFactory extends TokenFilterFactory { + public HyphenatedWordsFilter create(TokenStream input) { + return new HyphenatedWordsFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/HyphenatedWordsFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (working copy) @@ -0,0 +1,96 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.*; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; + +import java.util.Map; +import java.util.Set; +import java.io.IOException; + +/** + * Factory for {@link KeepWordFilter}. + *
+ * <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/>
+ *   </analyzer>
+ * </fieldType>
 + * The word set is loaded in inform() from the files named by the "words" argument; when no set is available, create() returns the input stream unchanged. + */ +public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + + @Override + public void init(Map args) { + super.init(args); + assureMatchVersion(); + } + + public void inform(ResourceLoader loader) { /* reads "words", "ignoreCase" (default false) and "enablePositionIncrements" (default false) */ + String wordFiles = args.get("words"); + ignoreCase = getBoolean("ignoreCase", false); + enablePositionIncrements = getBoolean("enablePositionIncrements",false); + + if (wordFiles != null) { + try { + words = getWordSet(loader, wordFiles, ignoreCase); + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading words", e); + } + } + } + + private CharArraySet words; + private boolean ignoreCase; + private boolean enablePositionIncrements; + + /** + * Set the keep word list. + * NOTE: if ignoreCase==true, the words are expected to be lowercase + */ + public void setWords(Set words) { + this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase); + } + + public void setIgnoreCase(boolean ignoreCase) { + if (words != null && this.ignoreCase != ignoreCase) { /* rebuild the existing set with the new ignoreCase flag */ + words = new CharArraySet(luceneMatchVersion, words, ignoreCase); + } + this.ignoreCase = ignoreCase; + } + + public boolean isEnablePositionIncrements() { + return enablePositionIncrements; + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public CharArraySet getWords() { + return words; + } + + public TokenStream create(TokenStream input) { + // words stays null when no "words" argument was configured and setWords() was never called; the stream is then passed through unfiltered + return words == null ? 
input : new KeepWordFilter(enablePositionIncrements, input, words); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeepWordFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.util.*; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link KeywordMarkerFilter}. + *
+ * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" ignoreCase="false"/>
+ *   </analyzer>
+ * </fieldType>
 + * Protected words are loaded in inform() from the files named by the "protected" argument; without them, create() returns the input stream unchanged. + */ +public class KeywordMarkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + public static final String PROTECTED_TOKENS = "protected"; + private CharArraySet protectedWords; + private boolean ignoreCase; + + public void inform(ResourceLoader loader) { /* reads "protected" and "ignoreCase" (default false) */ + String wordFiles = args.get(PROTECTED_TOKENS); + ignoreCase = getBoolean("ignoreCase", false); + if (wordFiles != null) { + try { + protectedWords = getWordSet(loader, wordFiles, ignoreCase); + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading protected words", e); + } + } + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public TokenStream create(TokenStream input) { + return protectedWords == null ? input : new KeywordMarkerFilter(input, protectedWords); /* protectedWords is null when the "protected" argument was absent */ + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/KeywordMarkerFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (working 
copy) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.LengthFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link LengthFilter}. + *
+ * <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/>
+ *   </analyzer>
+ * </fieldType>
 + * Both "min" and "max" are mandatory integer arguments; init() throws InitializationException when either is missing. + */ +public class LengthFilterFactory extends TokenFilterFactory { + int min,max; + boolean enablePositionIncrements; + public static final String MIN_KEY = "min"; + public static final String MAX_KEY = "max"; + + @Override + public void init(Map args) { + super.init(args); + String minKey = args.get(MIN_KEY); /* despite their names, minKey and maxKey hold the argument values, not the keys */ + String maxKey = args.get(MAX_KEY); + if (minKey == null || maxKey == null) { + throw new InitializationException("Both " + MIN_KEY + " and " + MAX_KEY + " are mandatory"); + } + min=Integer.parseInt(minKey); + max=Integer.parseInt(maxKey); + enablePositionIncrements = getBoolean("enablePositionIncrements",false); + } + + public LengthFilter create(TokenStream input) { + return new LengthFilter(enablePositionIncrements, input,min,max); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LengthFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java (working copy) @@ -0,0 +1,57 @@ +package 
org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link LimitTokenCountFilter}. + *
+ * <fieldType name="text_lngthcnt" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class LimitTokenCountFilterFactory extends TokenFilterFactory { + + int maxTokenCount; + + @Override + public void init(Map args) { + super.init( args ); + String maxTokenCountArg = args.get("maxTokenCount"); + if (maxTokenCountArg == null) { + throw new InitializationException("maxTokenCount is mandatory."); + } + maxTokenCount = Integer.parseInt(args.get(maxTokenCountArg)); + } + + @Override + public TokenStream create(TokenStream input) { + return new LimitTokenCountFilter(input, maxTokenCount); + } + +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/LimitTokenCountFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link RemoveDuplicatesTokenFilter}. + *
+ * <fieldType name="text_rmdup" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
 + * Reads no configuration arguments of its own; create() wraps the input stream in a new RemoveDuplicatesTokenFilter. + */ +public class RemoveDuplicatesTokenFilterFactory extends TokenFilterFactory { + public RemoveDuplicatesTokenFilter create(TokenStream input) { + return new RemoveDuplicatesTokenFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (working copy) @@ -0,0 +1,73 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; +import org.apache.lucene.analysis.util.*; + +/** + * Factory for {@link StemmerOverrideFilter}. + *
+ * <fieldType name="text_dicstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class StemmerOverrideFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private CharArrayMap dictionary = null; + private boolean ignoreCase; + + public void inform(ResourceLoader loader) { + String dictionaryFiles = args.get("dictionary"); + ignoreCase = getBoolean("ignoreCase", false); + if (dictionaryFiles != null) { + assureMatchVersion(); + List files = splitFileNames(dictionaryFiles); + try { + if (files.size() > 0) { + dictionary = new CharArrayMap(luceneMatchVersion, + files.size() * 10, ignoreCase); + for (String file : files) { + List list = loader.getLines(file.trim()); + for (String line : list) { + String[] mapping = line.split("\t", 2); + dictionary.put(mapping[0], mapping[1]); + } + } + } + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading dictionary", e); + } + } + } + + public boolean isIgnoreCase() { + return ignoreCase; + } + + public TokenStream create(TokenStream input) { + return dictionary == null ? 
input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/StemmerOverrideFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.TrimFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link TrimFilter}. + *
+ * <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.NGramTokenizerFactory"/>
+ *     <filter class="solr.TrimFilterFactory" updateOffsets="false"/>
+ *   </analyzer>
+ * </fieldType>
 + * The optional "updateOffsets" argument (default false) is read in init() and passed to each TrimFilter that create() builds. + * @see TrimFilter + */ +public class TrimFilterFactory extends TokenFilterFactory { + + protected boolean updateOffsets = false; + + @Override + public void init(Map args) { + super.init( args ); + + String v = args.get( "updateOffsets" ); + if( v != null ) { + try { + updateOffsets = Boolean.valueOf( v ); /* NOTE(review): Boolean.valueOf(String) does not throw; any value other than "true" (case-insensitive) yields false, so the catch below looks unreachable and "Must be true or false" is never enforced - confirm intent */ + } + catch( Exception ex ) { + throw new InitializationException("Error reading updateOffsets value. Must be true or false.", ex); + } + } + } + + public TrimFilter create(TokenStream input) { + return new TrimFilter(input, updateOffsets); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/TrimFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (working copy) @@ -0,0 +1,198 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; +import org.apache.lucene.analysis.util.*; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.io.IOException; + +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; + + +/** + * Factory for {@link WordDelimiterFilter}. + *
+ * <fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
+ *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
+ *             catenateWords="0" catenateNumbers="0" catenateAll="0"
+ *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
+ *             types="wdfftypes.txt" />
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class WordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + public static final String PROTECTED_TOKENS = "protected"; + public static final String TYPES = "types"; + + public void inform(ResourceLoader loader) { + String wordFiles = args.get(PROTECTED_TOKENS); + if (wordFiles != null) { + try { + protectedWords = getWordSet(loader, wordFiles, false); + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading protected words", e); + } + } + String types = args.get(TYPES); + if (types != null) { + try { + List files = splitFileNames( types ); + List wlist = new ArrayList(); + for( String file : files ){ + List lines = loader.getLines( file.trim() ); + wlist.addAll( lines ); + } + typeTable = parseTypes(wlist); + } catch (IOException e) { + throw new InitializationException("IOException while loading types", e); + } + } + } + + private CharArraySet protectedWords = null; + private int flags; + byte[] typeTable = null; + + @Override + public void init(Map args) { + super.init(args); + if (getInt("generateWordParts", 1) != 0) { + flags |= GENERATE_WORD_PARTS; + } + if (getInt("generateNumberParts", 1) != 0) { + flags |= GENERATE_NUMBER_PARTS; + } + if (getInt("catenateWords", 0) != 0) { + flags |= CATENATE_WORDS; + } + if (getInt("catenateNumbers", 0) != 0) { + flags |= CATENATE_NUMBERS; + } + if (getInt("catenateAll", 0) != 0) { + flags |= CATENATE_ALL; + } + if (getInt("splitOnCaseChange", 1) != 0) { + flags |= SPLIT_ON_CASE_CHANGE; + } + if (getInt("splitOnNumerics", 1) != 0) { + flags |= SPLIT_ON_NUMERICS; + } + if (getInt("preserveOriginal", 0) != 0) { + flags |= PRESERVE_ORIGINAL; + } + if (getInt("stemEnglishPossessive", 1) != 0) { + flags |= STEM_ENGLISH_POSSESSIVE; + } + } + + public WordDelimiterFilter create(TokenStream input) { + return new WordDelimiterFilter(input, typeTable == null ? 
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, + flags, protectedWords); + } + + // source => type + private static Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" ); + + // parses a list of MappingCharFilter style rules into a custom byte[] type table + private byte[] parseTypes(List rules) { + SortedMap typeMap = new TreeMap(); + for( String rule : rules ){ + Matcher m = typePattern.matcher(rule); + if( !m.find() ) + throw new InitializationException("Invalid Mapping Rule : [" + rule + "]"); + String lhs = parseString(m.group(1).trim()); + Byte rhs = parseType(m.group(2).trim()); + if (lhs.length() != 1) + throw new InitializationException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); + if (rhs == null) + throw new InitializationException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); + typeMap.put(lhs.charAt(0), rhs); + } + + // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance + byte types[] = new byte[Math.max(typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; + for (int i = 0; i < types.length; i++) + types[i] = WordDelimiterIterator.getType(i); + for (Map.Entry mapping : typeMap.entrySet()) + types[mapping.getKey()] = mapping.getValue(); + return types; + } + + private Byte parseType(String s) { + if (s.equals("LOWER")) + return LOWER; + else if (s.equals("UPPER")) + return UPPER; + else if (s.equals("ALPHA")) + return ALPHA; + else if (s.equals("DIGIT")) + return DIGIT; + else if (s.equals("ALPHANUM")) + return ALPHANUM; + else if (s.equals("SUBWORD_DELIM")) + return SUBWORD_DELIM; + else + return null; + } + + char[] out = new char[256]; + + private String parseString(String s){ + int readPos = 0; + int len = s.length(); + int writePos = 0; + while( readPos < len ){ + char c = s.charAt( readPos++ ); + if( c == '\\' ){ + if( readPos >= len ) + throw new InitializationException("Invalid escaped char in [" + s + "]"); + c = 
s.charAt( readPos++ ); + switch( c ) { + case '\\' : c = '\\'; break; + case 'n' : c = '\n'; break; + case 't' : c = '\t'; break; + case 'r' : c = '\r'; break; + case 'b' : c = '\b'; break; + case 'f' : c = '\f'; break; + case 'u' : + if( readPos + 3 >= len ) + throw new InitializationException("Invalid escaped char in [" + s + "]"); + c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); + readPos += 4; + break; + } + } + out[writePos++] = c; + } + return new String( out, 0, writePos ); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java (working copy) @@ -0,0 +1,63 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Creates new instances of {@link EdgeNGramTokenFilter}. + *
+ * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.EdgeNGramFilterFactory" side="front" minGramSize="1" maxGramSize="1"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class EdgeNGramFilterFactory extends TokenFilterFactory { + private int maxGramSize = 0; + + private int minGramSize = 0; + + private String side; + + @Override + public void init(Map args) { + super.init(args); + String maxArg = args.get("maxGramSize"); + maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) + : EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + + String minArg = args.get("minGramSize"); + minGramSize = (minArg != null ? Integer.parseInt(minArg) + : EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); + + side = args.get("side"); + if (side == null) { + side = EdgeNGramTokenFilter.Side.FRONT.getLabel(); + } + } + + public EdgeNGramTokenFilter create(TokenStream input) { + return new EdgeNGramTokenFilter(input, side, minGramSize, maxGramSize); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Creates new instances of {@link EdgeNGramTokenizer}. + *
+ * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.EdgeNGramTokenizerFactory" side="front" minGramSize="1" maxGramSize="1"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class EdgeNGramTokenizerFactory extends TokenizerFactory { + private int maxGramSize = 0; + + private int minGramSize = 0; + + private String side; + + @Override + public void init(Map args) { + super.init(args); + String maxArg = args.get("maxGramSize"); + maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + + String minArg = args.get("minGramSize"); + minGramSize = (minArg != null ? Integer.parseInt(minArg) : EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE); + + side = args.get("side"); + if (side == null) { + side = EdgeNGramTokenizer.Side.FRONT.getLabel(); + } + } + + public EdgeNGramTokenizer create(Reader input) { + return new EdgeNGramTokenizer(input, side, minGramSize, maxGramSize); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java (working copy) @@ -0,0 +1,57 @@ +package 
org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link NGramTokenFilter}. + *
+ * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class NGramFilterFactory extends TokenFilterFactory { + private int maxGramSize = 0; + + private int minGramSize = 0; + + /** Initialize the n-gram min and max sizes from the "minGramSize" and "maxGramSize" arguments. */ + @Override + public void init(Map args) { + super.init(args); + String maxArg = args.get("maxGramSize"); + maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) + : NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + + String minArg = args.get("minGramSize"); + minGramSize = (minArg != null ? Integer.parseInt(minArg) + : NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); + } + + public NGramTokenFilter create(TokenStream input) { + return new NGramTokenFilter(input, minGramSize, maxGramSize); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java (working copy) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link NGramTokenizer}. + *
+ * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class NGramTokenizerFactory extends TokenizerFactory { + private int maxGramSize = 0; + private int minGramSize = 0; + + /** Initializes the n-gram min and max sizes from the "minGramSize" and "maxGramSize" arguments. */ + @Override + public void init(Map args) { + super.init(args); + String maxArg = args.get("maxGramSize"); + maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); + + String minArg = args.get("minGramSize"); + minGramSize = (minArg != null ? Integer.parseInt(minArg) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE); + } + + /** Creates the {@link TokenStream} of n-grams from the given {@link Reader}. */ + public NGramTokenizer create(Reader input) { + return new NGramTokenizer(input, minGramSize, maxGramSize); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizerFactory.java ___________________________________________________________________ Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java (working copy) 
@@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.no; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.no.NorwegianLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link NorwegianLightStemFilter}. + *
+ * <fieldType name="text_nolgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.NorwegianLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class NorwegianLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new NorwegianLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,39 @@ +package org.apache.lucene.analysis.no; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link NorwegianMinimalStemFilter}. + *
+ * <fieldType name="text_nominstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.NorwegianMinimalStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new NorwegianMinimalStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java (working copy) @@ -0,0 +1,98 @@ +package org.apache.lucene.analysis.path; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.util.Map; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.path.PathHierarchyTokenizer; +import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenizerFactory; + +/** + * Factory for {@link PathHierarchyTokenizer}. + *
+ * <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="\" replace="/"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class PathHierarchyTokenizerFactory extends TokenizerFactory { + + private char delimiter; + private char replacement; + private boolean reverse = false; + private int skip = PathHierarchyTokenizer.DEFAULT_SKIP; + + /** + * Reads the optional "delimiter", "replace", "reverse" and "skip" arguments; none is required. + */ + @Override + public void init(Map args){ + super.init( args ); + + String v = args.get( "delimiter" ); + if( v != null ){ + if( v.length() != 1 ){ + throw new InitializationException("delimiter should be a char. \"" + v + "\" is invalid"); + } + else{ + delimiter = v.charAt(0); + } + } + else{ + delimiter = PathHierarchyTokenizer.DEFAULT_DELIMITER; + } + + v = args.get( "replace" ); + if( v != null ){ + if( v.length() != 1 ){ + throw new InitializationException("replace should be a char. \"" + v + "\" is invalid"); + } + else{ + replacement = v.charAt(0); + } + } + else{ + replacement = delimiter; + } + + v = args.get( "reverse" ); + if( v != null ){ + reverse = "true".equals( v ); + } + + v = args.get( "skip" ); + if( v != null ){ + skip = Integer.parseInt( v ); + } + } + + public Tokenizer create(Reader input) { + if( reverse ) { + return new ReversePathHierarchyTokenizer(input, delimiter, replacement, skip); + } + return new PathHierarchyTokenizer(input, delimiter, replacement, skip); + } +} + + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 
## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java (working copy) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter; +import org.apache.lucene.analysis.util.CharFilterFactory; + +/** + * Factory for {@link PatternReplaceCharFilter}. + *
+ * <fieldType name="text_ptnreplace" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <charFilter class="solr.PatternReplaceCharFilterFactory" 
+ *                    pattern="([^a-z])" replacement=""/>
+ *     <tokenizer class="solr.KeywordTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + * @since Solr 3.1 + */ +public class PatternReplaceCharFilterFactory extends CharFilterFactory { + + private Pattern p; + private String replacement; + + @Override + public void init(Map args) { + super.init( args ); + p = getPattern("pattern"); + replacement = args.get( "replacement" ); + if( replacement == null ) + replacement = ""; + // TODO: throw exception if you set maxBlockChars or blockDelimiters ? + } + + public CharFilter create(Reader input) { + return new PatternReplaceCharFilter( p, replacement, input ); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java (working copy) @@ -0,0 +1,72 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pattern.PatternReplaceFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + * Factory for {@link PatternReplaceFilter}. + *
+ * <fieldType name="text_ptnreplace" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.KeywordTokenizerFactory"/>
+ *     <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement=""
+ *             replace="all"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @see PatternReplaceFilter + */ +public class PatternReplaceFilterFactory extends TokenFilterFactory { + Pattern p; + String replacement; + boolean all = true; + + @Override + public void init(Map args) { + super.init(args); + p = getPattern("pattern"); + replacement = args.get("replacement"); + + String r = args.get("replace"); + if (null != r) { + if (r.equals("all")) { + all = true; + } else { + if (r.equals("first")) { + all = false; + } else { + throw new InitializationException + ("Configuration Error: 'replace' must be 'first' or 'all' in " + + this.getClass().getName()); + } + } + } + + } + public PatternReplaceFilter create(TokenStream input) { + return new PatternReplaceFilter(input, p, replacement, all); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java (working copy) @@ -0,0 +1,106 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.pattern.PatternTokenizer; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenizerFactory; + +/** + * Factory for {@link PatternTokenizer}. + * This tokenizer uses regex pattern matching to construct distinct tokens + * for the input stream. It takes two arguments: "pattern" and "group". + *

+ *

    + *
  • "pattern" is the regular expression.
  • + *
  • "group" says which group to extract into tokens.
  • + *
+ *

+ * group=-1 (the default) is equivalent to "split". In this case, the tokens will + * be equivalent to the output from (without empty tokens): + * {@link String#split(java.lang.String)} + *

+ *

+ * Using group >= 0 selects the matching group as the token. For example, if you have:
+ *

+ *  pattern = \'([^\']+)\'
+ *  group = 0
+ *  input = aaa 'bbb' 'ccc'
+ *
+ * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input + * but using group=1, the output would be: bbb and ccc (no ' marks) + *

+ *

NOTE: This Tokenizer does not output tokens that are of zero length.

+ * + *
+ * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.PatternTokenizerFactory" pattern="\'([^\']+)\'" group="1"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @see PatternTokenizer + * @since solr1.2 + * + */ +public class PatternTokenizerFactory extends TokenizerFactory +{ + public static final String PATTERN = "pattern"; + public static final String GROUP = "group"; + + protected Pattern pattern; + protected int group; + + /** + * Require a configured pattern + */ + @Override + public void init(Map args) + { + super.init(args); + pattern = getPattern( PATTERN ); + + group = -1; // use 'split' + String g = args.get( GROUP ); + if( g != null ) { + try { + group = Integer.parseInt( g ); + } + catch( Exception ex ) { + throw new InitializationException("invalid group argument: " + g); + } + } + } + + /** + * Split the input using configured pattern + */ + public Tokenizer create(final Reader in) { + try { + return new PatternTokenizer(in, pattern, group); + } catch( IOException ex ) { + throw new InitializationException("IOException thrown creating PatternTokenizer instance", ex); + } + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java 
(revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java (working copy) @@ -0,0 +1,86 @@ +package org.apache.lucene.analysis.payloads; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; +import org.apache.lucene.analysis.payloads.PayloadEncoder; +import org.apache.lucene.analysis.payloads.FloatEncoder; +import org.apache.lucene.analysis.payloads.IntegerEncoder; +import org.apache.lucene.analysis.payloads.IdentityEncoder; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * + * Factory for {@link DelimitedPayloadTokenFilter}. + *
+ * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float" delimiter="|"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + */ +public class DelimitedPayloadTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + public static final String ENCODER_ATTR = "encoder"; + public static final String DELIMITER_ATTR = "delimiter"; + + private PayloadEncoder encoder; + private char delimiter = '|'; + + public DelimitedPayloadTokenFilter create(TokenStream input) { + return new DelimitedPayloadTokenFilter(input, delimiter, encoder); + } + + @Override + public void init(Map args) { + super.init(args); + } + + public void inform(ResourceLoader loader) { + String encoderClass = args.get(ENCODER_ATTR); + if (encoderClass == null) { + throw new InitializationException("Parameter " + ENCODER_ATTR + " is mandatory"); + } + if (encoderClass.equals("float")){ + encoder = new FloatEncoder(); + } else if (encoderClass.equals("integer")){ + encoder = new IntegerEncoder(); + } else if (encoderClass.equals("identity")){ + encoder = new IdentityEncoder(); + } else { + encoder = loader.newInstance(encoderClass, PayloadEncoder.class); + } + + String delim = args.get(DELIMITER_ATTR); + if (delim != null){ + if (delim.length() == 1) { + delimiter = delim.charAt(0); + } else{ + throw new InitializationException("Delimiter must be one character only"); + } + } + } +} \ No newline at end of file Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/DelimitedPayloadTokenFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end 
of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (revision 1365483) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilter.java (working copy) @@ -40,6 +40,9 @@ public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) { super(input); + if (typeMatch == null) { + throw new IllegalArgumentException("typeMatch cannot be null"); + } //Need to encode the payload thePayload = new BytesRef(PayloadHelper.encodeFloat(payload)); this.typeMatch = typeMatch; Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java (working copy) @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.payloads; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import java.util.Map; + +/** + * Factory for {@link NumericPayloadTokenFilter}. + *
+ * <fieldType name="text_numpayload" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.NumericPayloadTokenFilterFactory" payload="24" typeMatch="word"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class NumericPayloadTokenFilterFactory extends TokenFilterFactory { + private float payload; + private String typeMatch; + @Override + public void init(Map args) { + super.init(args); + String payloadArg = args.get("payload"); + typeMatch = args.get("typeMatch"); + if (payloadArg == null || typeMatch == null) { + throw new InitializationException("Both payload and typeMatch are required"); + } + payload = Float.parseFloat(payloadArg); + } + public NumericPayloadTokenFilter create(TokenStream input) { + return new NumericPayloadTokenFilter(input,payload,typeMatch); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/NumericPayloadTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.payloads; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link TokenOffsetPayloadTokenFilter}. + *
+ * <fieldType name="text_tokenoffset" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.TokenOffsetPayloadTokenFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class TokenOffsetPayloadTokenFilterFactory extends TokenFilterFactory { + public TokenOffsetPayloadTokenFilter create(TokenStream input) { + return new TokenOffsetPayloadTokenFilter(input); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TokenOffsetPayloadTokenFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.payloads; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link TypeAsPayloadTokenFilter}. + *
+ * <fieldType name="text_typeaspayload" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.TypeAsPayloadTokenFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class TypeAsPayloadTokenFilterFactory extends TokenFilterFactory { + public TypeAsPayloadTokenFilter create(TokenStream input) { + return new TypeAsPayloadTokenFilter(input); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/payloads/TypeAsPayloadTokenFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java (working copy) @@ -0,0 +1,55 @@ +package org.apache.lucene.analysis.position; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.position.PositionFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link PositionFilter}. + * Set the positionIncrement of all tokens to the "positionIncrement", except the first return token which retains its + * original positionIncrement value. The default positionIncrement value is zero. + *
+ * <fieldType name="text_position" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.PositionFilterFactory" positionIncrement="0"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + * @see org.apache.lucene.analysis.position.PositionFilter + * @since solr 1.4 + */ +public class PositionFilterFactory extends TokenFilterFactory { + private int positionIncrement; + + @Override + public void init(Map args) { + super.init(args); + positionIncrement = getInt("positionIncrement", 0); + } + + public PositionFilter create(TokenStream input) { + return new PositionFilter(input, positionIncrement); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/position/PositionFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.pt; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pt.PortugueseLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PortugueseLightStemFilter}. + *
+ * <fieldType name="text_ptlgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.PortugueseLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class PortugueseLightStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new PortugueseLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.pt; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pt.PortugueseMinimalStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PortugueseMinimalStemFilter}. + *
+ * <fieldType name="text_ptminstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.PortugueseMinimalStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class PortugueseMinimalStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new PortugueseMinimalStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.pt; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.pt.PortugueseStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PortugueseStemFilter}. + *
+ * <fieldType name="text_ptstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.PortugueseStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class PortugueseStemFilterFactory extends TokenFilterFactory { + public TokenStream create(TokenStream input) { + return new PortugueseStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java (working copy) @@ -0,0 +1,43 @@ +package org.apache.lucene.analysis.reverse; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link ReverseStringFilter}. + *
+ * <fieldType name="text_rvsstr" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ReverseStringFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + * @since solr 1.4 + */ +public class ReverseStringFilterFactory extends TokenFilterFactory { + public ReverseStringFilter create(TokenStream in) { + assureMatchVersion(); + return new ReverseStringFilter(luceneMatchVersion,in); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/reverse/ReverseStringFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.ru; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ru.RussianLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link RussianLightStemFilter}. + *
+ * <fieldType name="text_rulgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.RussianLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class RussianLightStemFilterFactory extends TokenFilterFactory { + /** Wraps {@code input} in a new {@link RussianLightStemFilter}; this factory reads no configuration arguments. */ public TokenStream create(TokenStream input) { + return new RussianLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy) @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.shingle; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link ShingleFilter}. + *
+ * <fieldType name="text_shingle" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
+ *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class ShingleFilterFactory extends TokenFilterFactory { + private int minShingleSize; + private int maxShingleSize; + private boolean outputUnigrams; + private boolean outputUnigramsIfNoShingles; + private String tokenSeparator; + + /** Reads {@code maxShingleSize} and {@code minShingleSize} (defaults taken from {@link ShingleFilter}), {@code outputUnigrams} (default true), {@code outputUnigramsIfNoShingles} (default false) and {@code tokenSeparator} (default {@link ShingleFilter#TOKEN_SEPARATOR}). The parameter is typed {@code Map<String,String>} because its values are assigned straight to a String. @throws InitializationException if either size is below 2 or minShingleSize exceeds maxShingleSize */ @Override + public void init(Map<String,String> args) { + super.init(args); + maxShingleSize = getInt("maxShingleSize", + ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); + if (maxShingleSize < 2) { + throw new InitializationException("Invalid maxShingleSize (" + maxShingleSize + + ") - must be at least 2"); + } + minShingleSize = getInt("minShingleSize", + ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE); + if (minShingleSize < 2) { + throw new InitializationException("Invalid minShingleSize (" + minShingleSize + + ") - must be at least 2"); + } + if (minShingleSize > maxShingleSize) { + throw new InitializationException("Invalid minShingleSize (" + minShingleSize + + ") - must be no greater than maxShingleSize (" + + maxShingleSize + ")"); + } + outputUnigrams = getBoolean("outputUnigrams", true); + outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false); + tokenSeparator = args.containsKey("tokenSeparator") + ? 
args.get("tokenSeparator") + : ShingleFilter.TOKEN_SEPARATOR; + } + /** Builds a {@link ShingleFilter} over {@code input} with the configured min/max sizes, then applies the unigram and token-separator settings read in {@code init}. */ public ShingleFilter create(TokenStream input) { + ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); + r.setOutputUnigrams(outputUnigrams); + r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); + r.setTokenSeparator(tokenSeparator); + return r; + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java (working copy) @@ -0,0 +1,91 @@ +package org.apache.lucene.analysis.snowball; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import java.io.IOException; + +import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.apache.lucene.analysis.util.*; +import org.tartarus.snowball.SnowballProgram; + +/** + * Factory for {@link SnowballFilter}, with configurable language + *

+ * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection. + *

+ * <fieldType name="text_snowballstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.SnowballPorterFilterFactory" protected="protectedkeyword.txt" language="English"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + */ +public class SnowballPorterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + public static final String PROTECTED_TOKENS = "protected"; + + private String language = "English"; + private Class stemClass; + + + public void inform(ResourceLoader loader) { + String wordFiles = args.get(PROTECTED_TOKENS); + if (wordFiles != null) { + try { + protectedWords = getWordSet(loader, wordFiles, false); + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading protected words", e); + } + } + } + + private CharArraySet protectedWords = null; + + @Override + public void init(Map args) { + super.init(args); + final String cfgLanguage = args.get("language"); + if(cfgLanguage!=null) language = cfgLanguage; + + try { + stemClass = Class.forName("org.tartarus.snowball.ext." + language + "Stemmer"); + } catch (ClassNotFoundException e) { + throw new InitializationException("Can't find class for stemmer language " + language, e); + } + } + + public TokenFilter create(TokenStream input) { + SnowballProgram program; + try { + program = (SnowballProgram)stemClass.newInstance(); + } catch (Exception e) { + throw new InitializationException("Error instantiating stemmer for language " + language + "from class " + stemClass, e); + } + + if (protectedWords != null) + input = new KeywordMarkerFilter(input, protectedWords); + return new SnowballFilter(input, program); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballPorterFilterFactory.java 
___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.standard.ClassicFilter; + +/** + * Factory for {@link ClassicFilter}. + *
+ * <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.ClassicTokenizerFactory"/>
+ *     <filter class="solr.ClassicFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + */ +public class ClassicFilterFactory extends TokenFilterFactory { + /** Wraps {@code input} in a new {@link ClassicFilter}; this factory reads no configuration arguments. */ public TokenFilter create(TokenStream input) { + return new ClassicFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java (working copy) @@ -0,0 +1,57 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link ClassicTokenizer}. + *
+ * <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.ClassicTokenizerFactory" maxTokenLength="120"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + */ + +public class ClassicTokenizerFactory extends TokenizerFactory { + + private int maxTokenLength; + + /** Calls {@code assureMatchVersion()} and reads {@code maxTokenLength}, defaulting to {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. */ @Override + public void init(Map<String,String> args) { + super.init(args); + assureMatchVersion(); + maxTokenLength = getInt("maxTokenLength", + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); + } + + /** Creates a {@link ClassicTokenizer} over {@code input} with {@code luceneMatchVersion} and applies the configured maximum token length. */ public Tokenizer create(Reader input) { + ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, input); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link StandardFilter}. + *
+ * <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.StandardFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class StandardFilterFactory extends TokenFilterFactory { + /** Passes {@code args} to the base factory and then calls {@code assureMatchVersion()}; no other arguments are read here. */ @Override + public void init(Map<String,String> args) { + super.init(args); + assureMatchVersion(); + } + + /** Wraps {@code input} in a new {@link StandardFilter} built with {@code luceneMatchVersion}. */ public StandardFilter create(TokenStream input) { + return new StandardFilter(luceneMatchVersion, input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java (working copy) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link StandardTokenizer}. + *
+ * <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ + +public class StandardTokenizerFactory extends TokenizerFactory { + + private int maxTokenLength; + + /** Calls {@code assureMatchVersion()} and reads {@code maxTokenLength}, defaulting to {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. */ @Override + public void init(Map<String,String> args) { + super.init(args); + assureMatchVersion(); + maxTokenLength = getInt("maxTokenLength", + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); + } + + /** Creates a {@link StandardTokenizer} over {@code input} with {@code luceneMatchVersion} and applies the configured maximum token length. */ public StandardTokenizer create(Reader input) { + StandardTokenizer tokenizer + = new StandardTokenizer(luceneMatchVersion, input); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/StandardTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java (working copy) @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link UAX29URLEmailTokenizer}. + *
+ * <fieldType name="text_urlemail" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.UAX29URLEmailTokenizerFactory" maxTokenLength="255"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * + */ + +public class UAX29URLEmailTokenizerFactory extends TokenizerFactory { + + private int maxTokenLength; + + /** Calls {@code assureMatchVersion()} and reads {@code maxTokenLength}, defaulting to {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. */ @Override + public void init(Map<String,String> args) { + super.init(args); + assureMatchVersion(); + maxTokenLength = getInt("maxTokenLength", + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); + } + + /** Creates a {@link UAX29URLEmailTokenizer} over {@code input} with {@code luceneMatchVersion} and applies the configured maximum token length. */ public UAX29URLEmailTokenizer create(Reader input) { + UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, input); + tokenizer.setMaxTokenLength(maxTokenLength); + return tokenizer; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.sv; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.sv.SwedishLightStemFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link SwedishLightStemFilter}. + *
+ * <fieldType name="text_svlgtstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.LowerCaseFilterFactory"/>
+ *     <filter class="solr.SwedishLightStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class SwedishLightStemFilterFactory extends TokenFilterFactory { + /** Wraps {@code input} in a new {@link SwedishLightStemFilter}; this factory reads no configuration arguments. */ public TokenStream create(TokenStream input) { + return new SwedishLightStemFilter(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java (working copy) @@ -0,0 +1,165 @@ +package org.apache.lucene.analysis.synonym; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.text.ParseException; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.synonym.WordnetSynonymParser; +import org.apache.lucene.analysis.util.*; +import org.apache.lucene.util.Version; + +/** + * Factory for {@link SynonymFilter}. + *
+ * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
+ *             format="solr" ignoreCase="false" expand="true" 
+ *             tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private SynonymMap map; + private boolean ignoreCase; + + /** Returns {@code input} unchanged when the loaded map has no FST, otherwise wraps it in a {@link SynonymFilter}. The map is only assigned in {@link #inform}, so that must have run first. */ @Override + public TokenStream create(TokenStream input) { + // if the fst is null, it means there's actually no synonyms... just return the original stream + // as there is nothing to do here. + return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase); + } + + /** Builds the analyzer used to parse synonym entries (the configured {@code tokenizerFactory}, or a {@link WhitespaceTokenizer} when none is given, lower-casing when {@code ignoreCase} is true) and loads the map in the requested {@code format}: "solr" when absent, or "wordnet". @throws InitializationException on an unrecognized format or any failure while loading */ @Override + public void inform(ResourceLoader loader) { + final boolean ignoreCase = getBoolean("ignoreCase", false); + this.ignoreCase = ignoreCase; + + String tf = args.get("tokenizerFactory"); + + final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf); + + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader); + TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer; + return new TokenStreamComponents(tokenizer, stream); + } + }; + + String format = args.get("format"); + try { + if (format == null || format.equals("solr")) { + // TODO: expose dedup as a parameter? + map = loadSolrSynonyms(loader, true, analyzer); + } else if (format.equals("wordnet")) { + map = loadWordnetSynonyms(loader, true, analyzer); + } else { + // TODO: somehow make this more pluggable + throw new InitializationException("Unrecognized synonyms format: " + format); + } + } catch (Exception e) { + throw new InitializationException("Exception thrown while loading synonyms", e); + } + } + + /** + * Load synonyms from the solr format, "format=solr". 
+ */ + private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + final boolean expand = getBoolean("expand", true); + String synonyms = args.get("synonyms"); + if (synonyms == null) + throw new InitializationException("Missing required argument 'synonyms'."); + + CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer); + File synonymFile = new File(synonyms); + if (synonymFile.exists()) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); + } else { + List<String> files = splitFileNames(synonyms); + for (String file : files) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(file), decoder)); + } + } + return parser.build(); + } + + /** + * Load synonyms from the wordnet format, "format=wordnet". 
+ */ + private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { + final boolean expand = getBoolean("expand", true); + String synonyms = args.get("synonyms"); + if (synonyms == null) + throw new InitializationException("Missing required argument 'synonyms'."); + + CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + + WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer); + File synonymFile = new File(synonyms); + if (synonymFile.exists()) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); + } else { + List<String> files = splitFileNames(synonyms); + for (String file : files) { + decoder.reset(); + parser.add(new InputStreamReader(loader.openResource(file), decoder)); + } + } + return parser.build(); + } + + // (there are no tests for this functionality) + /** Instantiates {@code cname} through the loader, passes it this factory's match version and argument map, and informs it of the loader when it is {@link ResourceLoaderAware}. */ private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname){ + TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class); + tokFactory.setLuceneMatchVersion(luceneMatchVersion); + tokFactory.init(args); + if (tokFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) tokFactory).inform(loader); + } + return tokFactory; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java ___________________________________________________________________ Added: 
svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.th; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.th.ThaiWordFilter; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link ThaiWordFilter}. + *
+ * <fieldType name="text_thai" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.ThaiWordFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class ThaiWordFilterFactory extends TokenFilterFactory { + public ThaiWordFilter create(TokenStream input) { + assureMatchVersion(); + return new ThaiWordFilter(luceneMatchVersion, input); + } +} + Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.tr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link TurkishLowerCaseFilter}. + *
+ * <fieldType name="text_trlwr" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.TurkishLowerCaseFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class TurkishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + public TokenStream create(TokenStream input) { + return new TurkishLowerCaseFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java (working copy) @@ -0,0 +1,103 @@ +package org.apache.lucene.analysis.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; +import java.util.Collections; +import java.util.Locale; +import java.util.Map; +import java.util.LinkedHashMap; +import java.util.Set; +import java.util.ServiceConfigurationError; + +import org.apache.lucene.util.SPIClassIterator; + +/** + * Helper class for loading named SPIs from classpath (e.g. Tokenizers, TokenStreams). + * @lucene.internal + */ +public final class AnalysisSPILoader { + + private final Map> services; + private final Class clazz; + + public AnalysisSPILoader(Class clazz) { + this(clazz, new String[] { clazz.getSimpleName() }); + } + + public AnalysisSPILoader(Class clazz, ClassLoader loader) { + this(clazz, new String[] { clazz.getSimpleName() }, loader); + } + + public AnalysisSPILoader(Class clazz, String[] suffixes) { + this(clazz, suffixes, Thread.currentThread().getContextClassLoader()); + } + + public AnalysisSPILoader(Class clazz, String[] suffixes, ClassLoader classloader) { + this.clazz = clazz; + final SPIClassIterator loader = SPIClassIterator.get(clazz, classloader); + final LinkedHashMap> services = new LinkedHashMap>(); + while (loader.hasNext()) { + final Class service = loader.next(); + final String clazzName = service.getSimpleName(); + String name = null; + for (String suffix : suffixes) { + if (clazzName.endsWith(suffix)) { + name = clazzName.substring(0, clazzName.length() - suffix.length()).toLowerCase(Locale.ROOT); + break; + } + } + if (name == null) { + throw new ServiceConfigurationError("The class name " + service.getName() + + " has wrong suffix, allowed are: " + 
Arrays.toString(suffixes)); + } + // only add the first one for each name, later services will be ignored + // this allows to place services before others in classpath to make + // them used instead of others + if (!services.containsKey(name)) { + services.put(name, service); + } + } + this.services = Collections.unmodifiableMap(services); + } + + public S newInstance(String name) { + final Class service = lookupClass(name); + try { + return service.newInstance(); + } catch (Exception e) { + throw new IllegalArgumentException("SPI class of type "+clazz.getName()+" with name '"+name+"' cannot be instantiated. " + + "This is likely due to a misconfiguration of the java class '" + service.getName() + "': ", e); + } + } + + public Class lookupClass(String name) { + final Class service = services.get(name.toLowerCase(Locale.ROOT)); + if (service != null) { + return service; + } else { + throw new IllegalArgumentException("A SPI class of type "+clazz.getName()+" with name '"+name+"' does not exist. 
"+ + "You need to add the corresponding JAR file supporting this SPI to your classpath."+ + "The current classpath supports the following names: "+availableServices()); + } + } + + public Set availableServices() { + return services.keySet(); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/AnalysisSPILoader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharFilterFactory.java (revision 1365483) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharFilterFactory.java (working copy) @@ -18,6 +18,7 @@ */ import java.io.Reader; +import java.util.Set; import org.apache.lucene.analysis.CharFilter; @@ -27,5 +28,32 @@ */ public abstract class CharFilterFactory extends AbstractAnalysisFactory { - public abstract CharFilter create(Reader input); + private static final AnalysisSPILoader loader = + getSPILoader(Thread.currentThread().getContextClassLoader()); + + /** + * Used by e.g. Apache Solr to get a correctly configured instance + * of {@link AnalysisSPILoader} from Solr's classpath. 
+ * @lucene.internal + */ + public static AnalysisSPILoader getSPILoader(ClassLoader classloader) { + return new AnalysisSPILoader(CharFilterFactory.class, classloader); + } + + /** looks up a charfilter by name from context classpath */ + public static CharFilterFactory forName(String name) { + return loader.newInstance(name); + } + + /** looks up a charfilter class by name from context classpath */ + public static Class lookupClass(String name) { + return loader.lookupClass(name); + } + + /** returns a list of all available charfilter names */ + public static Set availableCharFilters() { + return loader.availableServices(); + } + + public abstract Reader create(Reader input); } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenFilterFactory.java (revision 1365483) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenFilterFactory.java (working copy) @@ -17,6 +17,8 @@ * limitations under the License. */ +import java.util.Set; + import org.apache.lucene.analysis.TokenStream; /** @@ -25,6 +27,34 @@ */ public abstract class TokenFilterFactory extends AbstractAnalysisFactory { + private static final AnalysisSPILoader loader = + getSPILoader(Thread.currentThread().getContextClassLoader()); + + /** + * Used by e.g. Apache Solr to get a correctly configured instance + * of {@link AnalysisSPILoader} from Solr's classpath. 
+ * @lucene.internal + */ + public static AnalysisSPILoader getSPILoader(ClassLoader classloader) { + return new AnalysisSPILoader(TokenFilterFactory.class, + new String[] { "TokenFilterFactory", "FilterFactory" }, classloader); + } + + /** looks up a tokenfilter by name from context classpath */ + public static TokenFilterFactory forName(String name) { + return loader.newInstance(name); + } + + /** looks up a tokenfilter class by name from context classpath */ + public static Class lookupClass(String name) { + return loader.lookupClass(name); + } + + /** returns a list of all available tokenfilter names from context classpath */ + public static Set availableTokenFilters() { + return loader.availableServices(); + } + /** Transform the specified input TokenStream */ public abstract TokenStream create(TokenStream input); } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java (revision 1365483) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/util/TokenizerFactory.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.Tokenizer; import java.io.Reader; +import java.util.Set; /** * Abstract parent class for analysis factories that create {@link Tokenizer} @@ -27,6 +28,33 @@ */ public abstract class TokenizerFactory extends AbstractAnalysisFactory { + private static final AnalysisSPILoader loader = + getSPILoader(Thread.currentThread().getContextClassLoader()); + + /** + * Used by e.g. Apache Solr to get a correctly configured instance + * of {@link AnalysisSPILoader} from Solr's classpath. 
+ * @lucene.internal + */ + public static AnalysisSPILoader getSPILoader(ClassLoader classloader) { + return new AnalysisSPILoader(TokenizerFactory.class, classloader); + } + + /** looks up a tokenizer by name from context classpath */ + public static TokenizerFactory forName(String name) { + return loader.newInstance(name); + } + + /** looks up a tokenizer class by name from context classpath */ + public static Class lookupClass(String name) { + return loader.lookupClass(name); + } + + /** returns a list of all available tokenizer names from context classpath */ + public static Set availableTokenizers() { + return loader.availableServices(); + } + /** Creates a TokenStream of the specified input */ public abstract Tokenizer create(Reader input); } Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.wikipedia; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; + +/** + * Factory for {@link WikipediaTokenizer}. + *
+ * <fieldType name="text_wiki" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WikipediaTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class WikipediaTokenizerFactory extends TokenizerFactory { + // TODO: add support for WikipediaTokenizer's advanced options. + public Tokenizer create(Reader input) { + return new WikipediaTokenizer(input); + } +} Index: lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory =================================================================== --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory (revision 0) +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory (working copy) @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory +org.apache.lucene.analysis.charfilter.MappingCharFilterFactory +org.apache.lucene.analysis.fa.PersianCharFilterFactory +org.apache.lucene.analysis.pattern.PatternReplaceCharFilterFactory Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory +org.apache.lucene.analysis.ar.ArabicStemFilterFactory +org.apache.lucene.analysis.bg.BulgarianStemFilterFactory +org.apache.lucene.analysis.br.BrazilianStemFilterFactory +org.apache.lucene.analysis.cjk.CJKBigramFilterFactory +org.apache.lucene.analysis.cjk.CJKWidthFilterFactory +org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory +org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory +org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory +org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory +org.apache.lucene.analysis.core.LowerCaseFilterFactory +org.apache.lucene.analysis.core.StopFilterFactory +org.apache.lucene.analysis.core.TypeTokenFilterFactory +org.apache.lucene.analysis.cz.CzechStemFilterFactory +org.apache.lucene.analysis.de.GermanLightStemFilterFactory +org.apache.lucene.analysis.de.GermanMinimalStemFilterFactory +org.apache.lucene.analysis.de.GermanNormalizationFilterFactory +org.apache.lucene.analysis.de.GermanStemFilterFactory +org.apache.lucene.analysis.el.GreekLowerCaseFilterFactory +org.apache.lucene.analysis.el.GreekStemFilterFactory +org.apache.lucene.analysis.en.EnglishMinimalStemFilterFactory +org.apache.lucene.analysis.en.EnglishPossessiveFilterFactory +org.apache.lucene.analysis.en.KStemFilterFactory +org.apache.lucene.analysis.en.PorterStemFilterFactory +org.apache.lucene.analysis.es.SpanishLightStemFilterFactory +org.apache.lucene.analysis.fa.PersianNormalizationFilterFactory +org.apache.lucene.analysis.fi.FinnishLightStemFilterFactory +org.apache.lucene.analysis.fr.ElisionFilterFactory +org.apache.lucene.analysis.fr.FrenchLightStemFilterFactory +org.apache.lucene.analysis.fr.FrenchMinimalStemFilterFactory +org.apache.lucene.analysis.ga.IrishLowerCaseFilterFactory +org.apache.lucene.analysis.gl.GalicianMinimalStemFilterFactory +org.apache.lucene.analysis.gl.GalicianStemFilterFactory 
+org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory +org.apache.lucene.analysis.hi.HindiStemFilterFactory +org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory +org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory +org.apache.lucene.analysis.id.IndonesianStemFilterFactory +org.apache.lucene.analysis.in.IndicNormalizationFilterFactory +org.apache.lucene.analysis.it.ItalianLightStemFilterFactory +org.apache.lucene.analysis.lv.LatvianStemFilterFactory +org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory +org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory +org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory +org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory +org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory +org.apache.lucene.analysis.miscellaneous.LengthFilterFactory +org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory +org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory +org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilterFactory +org.apache.lucene.analysis.miscellaneous.TrimFilterFactory +org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory +org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory +org.apache.lucene.analysis.ngram.NGramFilterFactory +org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory +org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory +org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory +org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory +org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory +org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory +org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilterFactory +org.apache.lucene.analysis.position.PositionFilterFactory +org.apache.lucene.analysis.pt.PortugueseLightStemFilterFactory 
+org.apache.lucene.analysis.pt.PortugueseMinimalStemFilterFactory +org.apache.lucene.analysis.pt.PortugueseStemFilterFactory +org.apache.lucene.analysis.reverse.ReverseStringFilterFactory +org.apache.lucene.analysis.ru.RussianLightStemFilterFactory +org.apache.lucene.analysis.shingle.ShingleFilterFactory +org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory +org.apache.lucene.analysis.standard.ClassicFilterFactory +org.apache.lucene.analysis.standard.StandardFilterFactory +org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory +org.apache.lucene.analysis.synonym.SynonymFilterFactory +org.apache.lucene.analysis.th.ThaiWordFilterFactory +org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory Index: lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory =================================================================== --- lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (revision 0) +++ lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (working copy) @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.core.KeywordTokenizerFactory +org.apache.lucene.analysis.core.LetterTokenizerFactory +org.apache.lucene.analysis.core.LowerCaseTokenizerFactory +org.apache.lucene.analysis.core.WhitespaceTokenizerFactory +org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory +org.apache.lucene.analysis.ngram.NGramTokenizerFactory +org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory +org.apache.lucene.analysis.pattern.PatternTokenizerFactory +org.apache.lucene.analysis.standard.ClassicTokenizerFactory +org.apache.lucene.analysis.standard.StandardTokenizerFactory +org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory +org.apache.lucene.analysis.wikipedia.WikipediaTokenizerFactory Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java (working copy) @@ -0,0 +1,85 @@ +package org.apache.lucene.analysis.ar; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.fa.PersianCharFilterFactory; +import org.apache.lucene.analysis.standard.StandardTokenizerFactory; + +/** + * Simple tests to ensure the Arabic filter Factories are working. + */ +public class TestArabicFilters extends BaseTokenStreamTestCase { + + /** + * Test ArabicNormalizationFilterFactory + */ + public void testNormalizer() throws Exception { + Reader reader = new StringReader("الذين مَلكت أيمانكم"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory(); + filterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + filterFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = filterFactory.create(tokenizer); + assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"}); + } + + /** + * Test ArabicStemFilterFactory + */ + public void testStemmer() throws Exception { + Reader reader = new StringReader("الذين مَلكت أيمانكم"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory(); + normFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory(); + Map args = Collections.emptyMap(); + factory.init(args); + normFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = normFactory.create(tokenizer); + stream = stemFactory.create(stream); + 
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"}); + } + + /** + * Test PersianCharFilterFactory + */ + public void testPersianCharFilter() throws Exception { + Reader reader = new StringReader("می‌خورد"); + PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory(); + StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory(); + tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + tokenizerFactory.init(args); + TokenStream stream = tokenizerFactory.create(charfilterFactory.create(reader)); + assertTokenStreamContents(stream, new String[] { "می", "خورد" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicFilters.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.bg; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Bulgarian stem filter factory is working. + */ +public class TestBulgarianStemFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually stems text. 
+ */ + public void testStemming() throws Exception { + Reader reader = new StringReader("компютри"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "компютр" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/bg/TestBulgarianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.br; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Brazilian stem filter factory is working. + */ +public class TestBrazilianStemFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually stems and normalizes text. + */ + public void testStemming() throws Exception { + Reader reader = new StringReader("Brasília"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "brasil" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/br/TestBrazilianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java (working copy) @@ -0,0 +1,127 @@ +package org.apache.lucene.analysis.charfilter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.*; + +/** + * Simple tests to ensure this factory is working + */ +public class TestHTMLStripCharFilterFactory extends BaseTokenStreamTestCase { + + + public void testNothingChanged() throws IOException { + // 11111111112 + // 012345678901234567890 + final String text = "this is only a test."; + HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); + Map args = new HashMap(); + args.put("escapedTags", "a, Title"); + factory.init(args); + CharFilter cs = factory.create(new StringReader(text)); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "only", "a", "test." }, + new int[] { 0, 5, 8, 13, 15 }, + new int[] { 4, 7, 12, 14, 20 }); + } + + public void testNoEscapedTags() throws IOException { + // 11111111112222222222333333333344 + // 012345678901234567890123456789012345678901 + final String text = "this is only a test."; + HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); + Map args = new HashMap(); + factory.init(args); + CharFilter cs = factory.create(new StringReader(text)); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "only", "a", "test." 
}, + new int[] { 3, 12, 18, 27, 32 }, + new int[] { 11, 14, 26, 28, 41 }); + } + + public void testEscapedTags() throws IOException { + // 11111111112222222222333333333344 + // 012345678901234567890123456789012345678901 + final String text = "this is only a test."; + HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); + Map args = new HashMap(); + args.put("escapedTags", "U i"); + factory.init(args); + CharFilter cs = factory.create(new StringReader(text)); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "only", "a", "test." }, + new int[] { 0, 12, 18, 27, 29 }, + new int[] { 11, 14, 26, 28, 41 }); + } + + public void testSeparatorOnlyEscapedTags() throws IOException { + // 11111111112222222222333333333344 + // 012345678901234567890123456789012345678901 + final String text = "this is only a test."; + HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); + Map args = new HashMap(); + args.put("escapedTags", ",, , "); + factory.init(args); + CharFilter cs = factory.create(new StringReader(text)); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "only", "a", "test." }, + new int[] { 3, 12, 18, 27, 32 }, + new int[] { 11, 14, 26, 28, 41 }); + } + + public void testEmptyEscapedTags() throws IOException { + // 11111111112222222222333333333344 + // 012345678901234567890123456789012345678901 + final String text = "this is only a test."; + HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); + Map args = new HashMap(); + args.put("escapedTags", ""); + factory.init(args); + CharFilter cs = factory.create(new StringReader(text)); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "only", "a", "test." 
}, + new int[] { 3, 12, 18, 27, 32 }, + new int[] { 11, 14, 26, 28, 41 }); + } + + public void testSingleEscapedTag() throws IOException { + // 11111111112222222222333333333344 + // 012345678901234567890123456789012345678901 + final String text = "this is only a test."; + HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); + Map args = new HashMap(); + args.put("escapedTags", ", B\r\n\t"); + factory.init(args); + CharFilter cs = factory.create(new StringReader(text)); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "only", "a", "test." }, + new int[] { 3, 12, 15, 27, 32 }, + new int[] { 11, 14, 26, 28, 41 }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java (working copy) @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.charfilter; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.util.LuceneTestCase; + +public class TestMappingCharFilterFactory extends LuceneTestCase { + public void testParseString() throws Exception { + + MappingCharFilterFactory f = new MappingCharFilterFactory(); + + try { + f.parseString( "\\" ); + fail( "escape character cannot be alone." ); + } + catch (InitializationException expected) {} + + assertEquals( "unexpected escaped characters", + "\\\"\n\t\r\b\f", f.parseString( "\\\\\\\"\\n\\t\\r\\b\\f" ) ); + assertEquals( "unexpected escaped characters", + "A", f.parseString( "\\u0041" ) ); + assertEquals( "unexpected escaped characters", + "AB", f.parseString( "\\u0041\\u0042" ) ); + + try { + f.parseString( "\\u000" ); + fail( "invalid length check." ); + } + catch (InitializationException expected) {} + + try { + f.parseString( "\\u123x" ); + fail( "invalid hex number check." 
); + } + catch( NumberFormatException expected ){} + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestMappingCharFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java (working copy) @@ -0,0 +1,55 @@ +package org.apache.lucene.analysis.cjk; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardTokenizer; + +/** + * Simple tests to ensure the CJK bigram factory is working. + */ +public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase { + public void testDefaults() throws Exception { + Reader reader = new StringReader("多くの学生が試験に落ちた。"); + CJKBigramFilterFactory factory = new CJKBigramFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader)); + assertTokenStreamContents(stream, + new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" }); + } + + public void testHanOnly() throws Exception { + Reader reader = new StringReader("多くの学生が試験に落ちた。"); + CJKBigramFilterFactory factory = new CJKBigramFilterFactory(); + Map args = new HashMap(); + args.put("hiragana", "false"); + factory.init(args); + TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader)); + assertTokenStreamContents(stream, + new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java 
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.cjk; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the CJKWidthFilterFactory is working + */ +public class TestCJKWidthFilterFactory extends BaseTokenStreamTestCase { + public void test() throws Exception { + Reader reader = new StringReader("Test 1234"); + CJKWidthFilterFactory factory = new CJKWidthFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "Test", "1234" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKWidthFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java (working copy) @@ -0,0 +1,107 @@ +package org.apache.lucene.analysis.commongrams; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.TestStopFilter; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; +import java.util.HashMap; + +/** + * Tests pretty much copied from StopFilterFactoryTest We use the test files + * used by the StopFilterFactoryTest TODO: consider creating separate test files + * so this won't break if stop filter test files change + **/ +public class TestCommonGramsFilterFactory extends BaseTokenStreamTestCase { + + public void testInform() throws Exception { + ResourceLoader loader = new ResourceAsStreamResourceLoader(TestStopFilter.class); + assertTrue("loader is null and it shouldn't be", loader != null); + CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); + Map args = new HashMap(); + args.put("words", "stop-1.txt"); + args.put("ignoreCase", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + 
factory.init(args); + factory.inform(loader); + CharArraySet words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 2, + words.size() == 2); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + factory = new CommonGramsFilterFactory(); + args.put("words", "stop-1.txt, stop-2.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 4, + words.size() == 4); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + factory = new CommonGramsFilterFactory(); + args.put("words", "stop-snowball.txt"); + args.put("format", "snowball"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); + assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + assertTrue(words.contains("herself")); + } + + /** + * If no words are provided, then a set of english default stopwords is used. 
+ */ + public void testDefaults() throws Exception { + ResourceLoader loader = new ResourceAsStreamResourceLoader(TestStopFilter.class); + assertTrue("loader is null and it shouldn't be", loader != null); + CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + factory.inform(loader); + CharArraySet words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue(words.contains("the")); + Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, + new String[] { "testing", "testing_the", "the", "the_factory", "factory" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java (revision 0) +++ 
lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java (working copy) @@ -0,0 +1,107 @@ +package org.apache.lucene.analysis.commongrams; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.TestStopFilter; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; +import java.util.HashMap; + +/** + * Tests pretty much copied from StopFilterFactoryTest We use the test files + * used by the StopFilterFactoryTest TODO: consider creating separate test files + * so this won't break if stop filter test files change + **/ +public class TestCommonGramsQueryFilterFactory extends BaseTokenStreamTestCase { + + public void testInform() throws Exception { + ResourceLoader loader = new 
ResourceAsStreamResourceLoader(TestStopFilter.class); + assertTrue("loader is null and it shouldn't be", loader != null); + CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); + Map args = new HashMap(); + args.put("words", "stop-1.txt"); + args.put("ignoreCase", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + CharArraySet words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 2, + words.size() == 2); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + factory = new CommonGramsQueryFilterFactory(); + args.put("words", "stop-1.txt, stop-2.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 4, + words.size() == 4); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory + .isIgnoreCase() == true); + + factory = new CommonGramsQueryFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + args.put("words", "stop-snowball.txt"); + args.put("format", "snowball"); + factory.init(args); + factory.inform(loader); + words = factory.getCommonWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); + assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + assertTrue(words.contains("herself")); + } + + /** + * If no words are provided, then a set of english default stopwords is used. 
+ */ + public void testDefaults() throws Exception { + ResourceLoader loader = new ResourceAsStreamResourceLoader(TestStopFilter.class); + assertTrue("loader is null and it shouldn't be", loader != null); + CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + factory.inform(loader); + CharArraySet words = factory.getCommonWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue(words.contains("the")); + Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, + new String[] { "testing_the", "the_factory" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/commongrams/TestCommonGramsQueryFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt (revision 0) +++ 
lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt (working copy) @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# A set of words for testing the DictionaryCompound factory +soft +ball +team Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/compoundDictionary.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt (revision 0) +++ 
lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt (working copy) @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# A set of words for testing the HyphenationCompound factory, +# in conjunction with the danish hyphenation grammar. 
+læse +hest Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/da_compoundDictionary.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java (working copy) @@ -0,0 +1,54 @@ +package org.apache.lucene.analysis.compound; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +/** + * Simple tests to ensure the Dictionary compound filter factory is working. + */ +public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually decompounds text. + */ + public void testDecompounding() throws Exception { + Reader reader = new StringReader("I like to play softball"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory(); + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + Map args = new HashMap(); + args.put("dictionary", "compoundDictionary.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, + new String[] { "I", "like", "to", "play", "softball", "soft", "ball" }); + } + +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java (working copy) 
Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestDictionaryCompoundWordTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java (working copy) @@ -0,0 +1,81 @@ +package org.apache.lucene.analysis.compound; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +/** + * Simple tests to ensure the Hyphenation compound filter factory is working. + */ +public class TestHyphenationCompoundWordTokenFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the factory works with hyphenation grammar+dictionary: using default options. + */ + public void testHyphenationWithDictionary() throws Exception { + Reader reader = new StringReader("min veninde som er lidt af en læsehest"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory(); + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + Map args = new HashMap(); + args.put("hyphenator", "da_UTF8.xml"); + args.put("dictionary", "da_compoundDictionary.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + TokenStream stream = factory.create(tokenizer); + + assertTokenStreamContents(stream, + new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, + new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 } + ); + } + + /** + * Ensure the factory works with no dictionary: using hyphenation grammar only. + * Also change the min/max subword sizes from the default. When using no dictionary, + * it's generally necessary to tweak these, or you get lots of expansions.
+ */ + public void testHyphenationOnly() throws Exception { + Reader reader = new StringReader("basketballkurv"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory(); + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + Map args = new HashMap(); + args.put("hyphenator", "da_UTF8.xml"); + args.put("minSubwordSize", "2"); + args.put("maxSubwordSize", "4"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + TokenStream stream = factory.create(tokenizer); + + assertTokenStreamContents(stream, + new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } + ); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license 
agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +foo +bar \ No newline at end of file Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-1.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +junk +more \ No newline at end of file Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-2.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt (working copy) @@ -0,0 +1,10 @@ + | This is a file in snowball format, empty lines are ignored, '|' is a comment + | Additionally, multiple words can be on the same line, allowing stopwords to be + | arranged in tables (useful in some languages where they might inflect) + + | fictitious table below + +|third person singular +|Subject Object Possessive Reflexive +he him his himself| masculine +she her hers herself| feminine Index: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stop-snowball.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-1.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + + Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/stoptypes-2.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java (working copy) @@ -0,0 +1,176 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.lang.reflect.Modifier; +import java.io.Reader; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.Collections; +import java.util.IdentityHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.EmptyTokenizer; +import org.apache.lucene.analysis.MockCharFilter; +import org.apache.lucene.analysis.MockFixedLengthPayloadFilter; +import org.apache.lucene.analysis.MockGraphTokenFilter; +import org.apache.lucene.analysis.MockHoleInjectingTokenFilter; +import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.MockVariableLengthPayloadFilter; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ValidatingTokenFilter; +import org.apache.lucene.analysis.core.TestRandomChains; +import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; +import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Tests that any newly added Tokenizers/TokenFilters/CharFilters have a + * corresponding factory (and that the SPI configuration is correct) + */ +public class TestAllAnalyzersHaveFactories extends LuceneTestCase { + + // these are test-only components (e.g. 
test-framework) + private static final Set<Class<?>> testComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>()); + static { + Collections.<Class<?>>addAll(testComponents, + MockTokenizer.class, + MockCharFilter.class, + MockFixedLengthPayloadFilter.class, + MockGraphTokenFilter.class, + MockHoleInjectingTokenFilter.class, + MockRandomLookaheadTokenFilter.class, + MockTokenFilter.class, + MockVariableLengthPayloadFilter.class, + EmptyTokenizer.class, + ValidatingTokenFilter.class + ); + } + + // these are 'crazy' components like cachingtokenfilter. does it make sense to add factories for these? + private static final Set<Class<?>> crazyComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>()); + static { + Collections.<Class<?>>addAll(crazyComponents, + CachingTokenFilter.class, + TeeSinkTokenFilter.class + ); + } + + // these are oddly-named (either the actual analyzer, or its factory) + // they do actually have factories. + // TODO: clean this up! + private static final Set<Class<?>> oddlyNamedComponents = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>()); + static { + Collections.<Class<?>>addAll(oddlyNamedComponents, + ReversePathHierarchyTokenizer.class, // this is supported via an option to PathHierarchyTokenizer's factory + SnowballFilter.class // this is called SnowballPorterFilterFactory + ); + } + + public void test() throws Exception { + List<Class<?>> analysisClasses = new ArrayList<Class<?>>(); + TestRandomChains.getClassesForPackage("org.apache.lucene.analysis", analysisClasses); + + for (final Class c : analysisClasses) { + final int modifiers = c.getModifiers(); + if ( + // don't waste time with abstract classes + Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers) + || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() + || testComponents.contains(c) + || crazyComponents.contains(c) + || oddlyNamedComponents.contains(c) + || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
+ ) { + continue; + } + + if (Tokenizer.class.isAssignableFrom(c)) { + String clazzName = c.getSimpleName(); + assertTrue(clazzName.endsWith("Tokenizer")); + String simpleName = clazzName.substring(0, clazzName.length() - 9); + TokenizerFactory instance = TokenizerFactory.forName(simpleName); + assertNotNull(instance); + try { + instance.setLuceneMatchVersion(TEST_VERSION_CURRENT); + instance.init(Collections.emptyMap()); + // TODO: provide fake ResourceLoader + if (!(instance instanceof ResourceLoaderAware)) { + assertSame(c, instance.create(new StringReader("")).getClass()); + } + } catch (InitializationException e) { + // TODO: For now pass because some factories have not yet a default config that always works, some require ResourceLoader + } + } else if (TokenFilter.class.isAssignableFrom(c)) { + String clazzName = c.getSimpleName(); + assertTrue(clazzName.endsWith("Filter")); + String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6)); + TokenFilterFactory instance = TokenFilterFactory.forName(simpleName); + assertNotNull(instance); + try { + instance.setLuceneMatchVersion(TEST_VERSION_CURRENT); + instance.init(Collections.emptyMap()); + // TODO: provide fake ResourceLoader + if (!(instance instanceof ResourceLoaderAware)) { + Class createdClazz = instance.create(new KeywordTokenizer(new StringReader(""))).getClass(); + // only check instance if factory have wrapped at all! 
+ if (KeywordTokenizer.class != createdClazz) { + assertSame(c, createdClazz); + } + } + } catch (InitializationException e) { + // TODO: For now pass because some factories have not yet a default config that always works, some require ResourceLoader + } + } else if (CharFilter.class.isAssignableFrom(c)) { + String clazzName = c.getSimpleName(); + assertTrue(clazzName.endsWith("CharFilter")); + String simpleName = clazzName.substring(0, clazzName.length() - 10); + CharFilterFactory instance = CharFilterFactory.forName(simpleName); + assertNotNull(instance); + try { + instance.setLuceneMatchVersion(TEST_VERSION_CURRENT); + instance.init(Collections.emptyMap()); + // TODO: provide fake ResourceLoader + if (!(instance instanceof ResourceLoaderAware)) { + Class createdClazz = instance.create(new StringReader("")).getClass(); + // only check instance if factory have wrapped at all! + if (StringReader.class != createdClazz) { + assertSame(c, createdClazz); + } + } + } catch (InitializationException e) { + // TODO: For now pass because some factories have not yet a default config that always works, some require ResourceLoader + } + } + } + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java =================================================================== --- 
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java (working copy) @@ -0,0 +1,179 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.util.Collections; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.StringMockResourceLoader; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; + +/** + * Sanity checks for every available tokenizer, token filter and char filter factory: + * each one is looked up by name, we try to initialize it with + * no parameters and, if that works, smoke test it with random data. + */ +// TODO: move this, TestRandomChains, and TestAllAnalyzersHaveFactories +// to an integration test module that sucks in all analysis modules. +// currently the only way to do this is via eclipse etc (LUCENE-3974) +public class TestFactories extends BaseTokenStreamTestCase { + public void test() throws IOException { + for (String tokenizer : TokenizerFactory.availableTokenizers()) { + doTestTokenizer(tokenizer); + } + + for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) { + doTestTokenFilter(tokenFilter); + } + + for (String charFilter : CharFilterFactory.availableCharFilters()) { + doTestCharFilter(charFilter); + } + } + + private void doTestTokenizer(String tokenizer) throws IOException { + TokenizerFactory factory = TokenizerFactory.forName(tokenizer); + if (initialize(factory)) { + // we managed to fully create an instance.
check a few more things: + + // if it implements MultiTermAwareComponent, sanity check its impl + if (factory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); + assertNotNull(mtc); + // it's not OK to return e.g. a charfilter here, but a tokenizer could wrap a filter around it + assertFalse(mtc instanceof CharFilterFactory); + } + + // beast it just a little, it shouldn't throw exceptions: + // (it should have thrown them in initialize) + checkRandomData(random(), new FactoryAnalyzer(factory, null, null), 100, 20, false, false); + } + } + + private void doTestTokenFilter(String tokenfilter) throws IOException { + TokenFilterFactory factory = TokenFilterFactory.forName(tokenfilter); + if (initialize(factory)) { + // we managed to fully create an instance. check a few more things: + + // if it implements MultiTermAwareComponent, sanity check its impl + if (factory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); + assertNotNull(mtc); + // it's not OK to return a charfilter or tokenizer here, this makes no sense + assertTrue(mtc instanceof TokenFilterFactory); + } + + // beast it just a little, it shouldn't throw exceptions: + // (it should have thrown them in initialize) + checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, factory, null), 100, 20, false, false); + } + } + + private void doTestCharFilter(String charfilter) throws IOException { + CharFilterFactory factory = CharFilterFactory.forName(charfilter); + if (initialize(factory)) { + // we managed to fully create an instance.
check a few more things: + + // if it implements MultiTermAwareComponent, sanity check its impl + if (factory instanceof MultiTermAwareComponent) { + AbstractAnalysisFactory mtc = ((MultiTermAwareComponent) factory).getMultiTermComponent(); + assertNotNull(mtc); + // it's not OK to return a tokenizer or tokenfilter here, this makes no sense + assertTrue(mtc instanceof CharFilterFactory); + } + + // beast it just a little, it shouldn't throw exceptions: + // (it should have thrown them in initialize) + checkRandomData(random(), new FactoryAnalyzer(assertingTokenizer, null, factory), 100, 20, false, false); + } + } + + /** Tries to initialize a factory with no arguments; for a ResourceLoaderAware factory the result reflects only whether inform() with a StringMockResourceLoader built from an empty string succeeded. */ + private boolean initialize(AbstractAnalysisFactory factory) { + boolean success = false; + try { + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(Collections.emptyMap()); + success = true; + } catch (InitializationException ignored) { + // it's OK for a factory to throw this if we don't provide the right parameters + } + + if (factory instanceof ResourceLoaderAware) { + success = false; + try { + ((ResourceLoaderAware) factory).inform(new StringMockResourceLoader("")); + success = true; + } catch (InitializationException ignored) { + // it's OK for a factory to throw this if the right files aren't available or whatever + } + } + return success; + } + + // some silly classes just so we can use checkRandomData + private TokenizerFactory assertingTokenizer = new TokenizerFactory() { + @Override + public Tokenizer create(Reader input) { + return new MockTokenizer(input); + } + }; + + private static class FactoryAnalyzer extends Analyzer { + final TokenizerFactory tokenizer; + final CharFilterFactory charFilter; + final TokenFilterFactory tokenfilter; + + FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) { + assert tokenizer != null; + this.tokenizer = tokenizer; + this.charFilter = charFilter; + this.tokenfilter = tokenfilter; + } + + @Override + protected
TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer tf = tokenizer.create(reader); + if (tokenfilter != null) { + return new TokenStreamComponents(tf, tokenfilter.create(tf)); + } else { + return new TokenStreamComponents(tf); + } + } + + @Override + protected Reader initReader(String fieldName, Reader reader) { + if (charFilter != null) { + return charFilter.create(reader); + } else { + return reader; + } + } + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestFactories.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (revision 1365483) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (working copy) @@ -235,7 +235,7 @@ private static Constructor castConstructor(Class instanceClazz, Constructor ctor) { return (Constructor) ctor; } - private static void getClassesForPackage(String pckgname, List> classes) throws Exception { + static void getClassesForPackage(String pckgname, List> classes) throws Exception { final ClassLoader cld = TestRandomChains.class.getClassLoader(); final String path = pckgname.replace('.', '/'); final Enumeration resources = cld.getResources(path); Index:
lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java (working copy) @@ -0,0 +1,76 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +import java.util.Map; +import java.util.HashMap; + +/** + * Checks that {@link StopFilterFactory} loads its stop words from one or more resource files, + * with and without the snowball format argument, and parses the ignoreCase argument. + **/ +public class TestStopFilterFactory extends BaseTokenStreamTestCase { + + public void testInform() throws Exception { + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + assertTrue("loader is null and it shouldn't be", loader != null); + StopFilterFactory factory = new StopFilterFactory(); + Map args = new HashMap(); + args.put("words", "stop-1.txt"); + args.put("ignoreCase", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + CharArraySet words = factory.getStopWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); + + factory = new StopFilterFactory(); + args.put("words", "stop-1.txt, stop-2.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + words = factory.getStopWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); + assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); + + factory = new StopFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + args.put("words", "stop-snowball.txt"); + args.put("format", "snowball"); + factory.init(args); + factory.inform(loader); + words = factory.getStopWords(); + assertEquals(8, words.size()); + assertTrue(words.contains("he")); + assertTrue(words.contains("him")); + assertTrue(words.contains("his")); +
assertTrue(words.contains("himself")); + assertTrue(words.contains("she")); + assertTrue(words.contains("her")); + assertTrue(words.contains("hers")); + assertTrue(words.contains("herself")); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestStopFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (working copy) @@ -0,0 +1,105 @@ +package org.apache.lucene.analysis.core; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/** + * Test case for {@link TypeTokenFilterFactory}: loading type lists from resource files, creating filters in blacklist and whitelist mode, and rejecting a missing types parameter. + */ +public class TestTypeTokenFilterFactory extends BaseTokenStreamTestCase { + + @Test + public void testInform() throws Exception { + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + TypeTokenFilterFactory factory = new TypeTokenFilterFactory(); + Map args = new HashMap(); + args.put("types", "stoptypes-1.txt"); + args.put("enablePositionIncrements", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + Set types = factory.getStopTypes(); + assertTrue("types is null and it shouldn't be", types != null); + assertTrue("types Size: " + types.size() + " is not: " + 2, types.size() == 2); + assertTrue("enablePositionIncrements was set to true but not correctly parsed", factory.isEnablePositionIncrements()); + + factory = new TypeTokenFilterFactory(); + args.put("types", "stoptypes-1.txt, stoptypes-2.txt"); + args.put("enablePositionIncrements", "false"); + args.put("useWhitelist","true"); + factory.init(args); + factory.inform(loader); + types = factory.getStopTypes(); + assertTrue("types is null and it
shouldn't be", types != null); + assertTrue("types Size: " + types.size() + " is not: " + 4, types.size() == 4); + assertTrue("enablePositionIncrements was set to false but not correctly parsed", !factory.isEnablePositionIncrements()); + } + + @Test + public void testCreationWithBlackList() throws Exception { + TypeTokenFilterFactory typeTokenFilterFactory = new TypeTokenFilterFactory(); + Map args = new HashMap(); + args.put("types", "stoptypes-1.txt, stoptypes-2.txt"); + args.put("enablePositionIncrements", "false"); + typeTokenFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + typeTokenFilterFactory.init(args); + NumericTokenStream input = new NumericTokenStream(); + input.setIntValue(123); + typeTokenFilterFactory.create(input); + } + + @Test + public void testCreationWithWhiteList() throws Exception { + TypeTokenFilterFactory typeTokenFilterFactory = new TypeTokenFilterFactory(); + Map args = new HashMap(); + args.put("types", "stoptypes-1.txt, stoptypes-2.txt"); + args.put("enablePositionIncrements", "false"); + args.put("useWhitelist","true"); + typeTokenFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + typeTokenFilterFactory.init(args); + NumericTokenStream input = new NumericTokenStream(); + input.setIntValue(123); + typeTokenFilterFactory.create(input); + } + + @Test + public void testMissingTypesParameter() throws Exception { + try { + TypeTokenFilterFactory typeTokenFilterFactory = new TypeTokenFilterFactory(); + Map args = new HashMap(); + args.put("enablePositionIncrements", "false"); + typeTokenFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + typeTokenFilterFactory.init(args); + typeTokenFilterFactory.inform(new ResourceAsStreamResourceLoader(getClass())); + fail("not supplying 'types' parameter should cause an InitializationException"); + } catch (InitializationException e) { + // everything ok + } + } + +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestTypeTokenFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.cz; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Czech stem filter factory is working. + */ +public class TestCzechStemFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter created by the factory actually stems text (angličtí is expected to become anglick). + */ + public void testStemming() throws Exception { + Reader reader = new StringReader("angličtí"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + CzechStemFilterFactory factory = new CzechStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "anglick" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/cz/TestCzechStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple test to ensure the filter created by the German light stem factory stems text (häuser is expected to become haus).
+ */ +public class TestGermanLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("häuser"); + GermanLightStemFilterFactory factory = new GermanLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "haus" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple test to ensure the filter created by the German minimal stem factory stems text (bilder is expected to become bild). + */ +public class TestGermanMinimalStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("bilder"); + GermanMinimalStemFilterFactory factory = new GermanMinimalStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "bild" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple test to ensure the filter created by the German normalization factory normalizes text (weißbier is expected to become weissbier).
+ */ +public class TestGermanNormalizationFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("weißbier"); + GermanNormalizationFilterFactory factory = new GermanNormalizationFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "weissbier" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanNormalizationFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.de; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the German stem filter factory is working. + */ +public class TestGermanStemFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter created by the factory actually stems text (Tischen is expected to become tisch). + */ + public void testStemming() throws Exception { + Reader reader = new StringReader("Tischen"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + GermanStemFilterFactory factory = new GermanStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "tisch" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/de/TestGermanStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index:
lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.el; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Greek lowercase filter factory is working. + */ +public class TestGreekLowerCaseFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually lowercases (and a bit more) Greek text: both input spellings are expected to become the same accent-free token.
+ */ + public void testNormalization() throws Exception { + Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekLowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.el; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.el.GreekLowerCaseFilter; + +/** + * Simple tests to ensure the Greek stem filter factory is working. + */ +public class TestGreekStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("άνθρωπος"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + TokenStream normalized = new GreekLowerCaseFilter(TEST_VERSION_CURRENT, tokenizer); + GreekStemFilterFactory factory = new GreekStemFilterFactory(); + TokenStream stream = factory.create(normalized); + assertTokenStreamContents(stream, new String[] { "ανθρωπ" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java (working copy) Property changes on: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/el/TestGreekStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the English minimal stem factory is working. 
+ */ +public class TestEnglishMinimalStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("bricks"); + EnglishMinimalStemFilterFactory factory = new EnglishMinimalStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "brick" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestEnglishMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the kstem filter factory is working. + */ +public class TestKStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("bricks"); + KStemFilterFactory factory = new KStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "brick" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestKStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java =================================================================== --- 
lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.en; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Porter stem filter factory is working. + */ +public class TestPorterStemFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually stems text. 
+ */ + public void testStemming() throws Exception { + Reader reader = new StringReader("dogs"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + PorterStemFilterFactory factory = new PorterStemFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "dog" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/en/TestPorterStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.es; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Spanish Light stem factory is working. + */ +public class TestSpanishLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("sociedades"); + SpanishLightStemFilterFactory factory = new SpanishLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "sociedad" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/es/TestSpanishLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java 
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.fa; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Persian normalization factory is working. + */ +public class TestPersianNormalizationFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually normalizes persian text. 
+ */ + public void testNormalization() throws Exception { + Reader reader = new StringReader("های"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "هاي" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianNormalizationFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.fi; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Finnish light stem factory is working. + */ +public class TestFinnishLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("aseistettujen"); + FinnishLightStemFilterFactory factory = new FinnishLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "aseistet" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/fi/TestFinnishLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt =================================================================== 
--- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt (working copy) @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# A set of articles for testing the French Elision filter. +# Requiring a text file is a bit weird here... 
+l +m +t +qu +n +s +j Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/frenchArticles.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java (working copy) @@ -0,0 +1,88 @@ +package org.apache.lucene.analysis.fr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +/** + * Simple tests to ensure the French elision filter factory is working. + */ +public class TestElisionFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually normalizes text. + */ + public void testElision() throws Exception { + Reader reader = new StringReader("l'avion"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + ElisionFilterFactory factory = new ElisionFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + Map args = new HashMap(); + args.put("articles", "frenchArticles.txt"); + factory.init(args); + factory.inform(loader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "avion" }); + } + + /** + * Test creating an elision filter without specifying any articles + */ + public void testDefaultArticles() throws Exception { + Reader reader = new StringReader("l'avion"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + ElisionFilterFactory factory = new ElisionFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + factory.inform(loader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "avion" }); + } + + /** + * Test setting ignoreCase=true 
+ */ + public void testCaseInsensitive() throws Exception { + Reader reader = new StringReader("L'avion"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + ElisionFilterFactory factory = new ElisionFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + Map args = new HashMap(); + args.put("articles", "frenchArticles.txt"); + args.put("ignoreCase", "true"); + factory.init(args); + factory.inform(loader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "avion" }); + } + +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestElisionFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.fr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the French light stem factory is working. + */ +public class TestFrenchLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("administrativement"); + FrenchLightStemFilterFactory factory = new FrenchLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "administratif" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end 
of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.fr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the French minimal stem factory is working. 
+ */ +public class TestFrenchMinimalStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("chevaux"); + FrenchMinimalStemFilterFactory factory = new FrenchMinimalStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "cheval" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/fr/TestFrenchMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.ga; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Sanity test for IrishLowerCaseFilterFactory: checks the tokens produced by a filter that the factory creates. + */ +public class TestIrishLowerCaseFilterFactory extends BaseTokenStreamTestCase { +/** Expects "nAthair tUISCE hARD" to come out as "n-athair", "t-uisce", "hard": the first two tokens gain a hyphen after their leading letter, while "hARD" is only lowercased. */ public void testCasing() throws Exception { + Reader reader = new StringReader("nAthair tUISCE hARD"); + IrishLowerCaseFilterFactory factory = new IrishLowerCaseFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "n-athair", "t-uisce", "hard" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/ga/TestIrishLowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.gl; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Galician plural stem factory is working. 
+ */ +public class TestGalicianMinimalStemFilterFactory extends BaseTokenStreamTestCase { +/** Runs the single token "elefantes" through a filter created by GalicianMinimalStemFilterFactory and expects the single output token "elefante". */ public void testStemming() throws Exception { + Reader reader = new StringReader("elefantes"); + GalicianMinimalStemFilterFactory factory = new GalicianMinimalStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "elefante" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.gl; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Sanity test for GalicianStemFilterFactory: checks the token produced by a filter that the factory creates. + */ +public class TestGalicianStemFilterFactory extends BaseTokenStreamTestCase { +/** Runs the single token "cariñosa" through the factory-created filter and expects the single output token "cariñ". */ public void testStemming() throws Exception { + Reader reader = new StringReader("cariñosa"); + GalicianStemFilterFactory factory = new GalicianStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "cariñ" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/gl/TestGalicianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java =================================================================== ---
lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java (working copy) @@ -0,0 +1,91 @@ +package org.apache.lucene.analysis.hi; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.in.IndicNormalizationFilterFactory; +import org.apache.lucene.analysis.standard.StandardTokenizerFactory; + +/** + * Simple tests to ensure the Hindi filter Factories are working. 
+ */ +public class TestHindiFilters extends BaseTokenStreamTestCase { + /** + * Test IndicNormalizationFilterFactory: tokenizes the input with StandardTokenizerFactory, applies the factory-created filter and expects the two tokens asserted below. + */ + public void testIndicNormalizer() throws Exception { + Reader reader = new StringReader("ত্‍ अाैर"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory(); + filterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map<String,String> args = Collections.emptyMap(); + factory.init(args); + filterFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = filterFactory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "ৎ", "और" }); + } + + /** + * Test HindiNormalizationFilterFactory, chained after the Indic normalization filter. NOTE(review): indicFilterFactory is used here without setLuceneMatchVersion/init, unlike testIndicNormalizer; confirm that is intentional. + */ + public void testHindiNormalizer() throws Exception { + Reader reader = new StringReader("क़िताब"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); + HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); + hindiFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map<String,String> args = Collections.emptyMap(); + factory.init(args); + hindiFilterFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = indicFilterFactory.create(tokenizer); + stream = hindiFilterFactory.create(stream); + assertTokenStreamContents(stream, new String[] {"किताब"}); + } + + /** + * Test HindiStemFilterFactory, chained after the Indic and Hindi normalization filters. NOTE(review): only the tokenizer and stem factories get setLuceneMatchVersion/init here; confirm the two normalization factories do not need it. + */ + public void testStemmer() throws Exception { + Reader reader = new StringReader("किताबें"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); + HindiNormalizationFilterFactory 
hindiFilterFactory = new HindiNormalizationFilterFactory(); + HindiStemFilterFactory stemFactory = new HindiStemFilterFactory(); + stemFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map<String,String> args = Collections.emptyMap(); + factory.init(args); + stemFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = indicFilterFactory.create(tokenizer); + stream = hindiFilterFactory.create(stream); + stream = stemFactory.create(stream); + assertTokenStreamContents(stream, new String[] {"किताब"}); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/hi/TestHindiFilters.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.hu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Sanity test for HungarianLightStemFilterFactory: checks the token produced by a filter that the factory creates. + */ +public class TestHungarianLightStemFilterFactory extends BaseTokenStreamTestCase { +/** Runs the single token "házakat" through the factory-created filter and expects the single output token "haz" (suffix removed and the accent on the first vowel dropped). */ public void testStemming() throws Exception { + Reader reader = new StringReader("házakat"); + HungarianLightStemFilterFactory factory = new HungarianLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "haz" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/hu/TestHungarianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at
end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; + +/** + * Simple test to ensure the Hunspell stemmer can be configured and loaded through HunspellStemFilterFactory. + */ +public class TestHunspellStemFilterFactory extends BaseTokenStreamTestCase { +/** Configures the factory with the "test.dic" dictionary and "test.aff" affix file, resolved through a ResourceAsStreamResourceLoader built from this test class, and expects the token "abc" to be stemmed to "ab". */ public void testStemming() throws Exception { + HunspellStemFilterFactory factory = new HunspellStemFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + args.put("dictionary", "test.dic"); + args.put("affix", "test.aff"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(new ResourceAsStreamResourceLoader(getClass())); + + Reader reader = new StringReader("abc"); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "ab" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java =================================================================== ---
lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java (working copy) @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.id; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Indonesian stem filter factory is working, both with an empty argument map and with stemDerivational set to "false". + */ +public class TestIndonesianStemFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually stems text: with an empty argument map, "dibukukannya" is expected to become "buku".
+ */ + public void testStemming() throws Exception { + Reader reader = new StringReader("dibukukannya"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "buku" }); + } + + /** + * Test inflectional-only mode: with stemDerivational set to "false", "dibukukannya" is expected to become "dibukukan". + */ + public void testStemmingInflectional() throws Exception { + Reader reader = new StringReader("dibukukannya"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + args.put("stemDerivational", "false"); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "dibukukan" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/id/TestIndonesianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java (revision 0) +++
lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.it; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Italian light stem factory is working. 
+ */ +public class TestItalianLightStemFilterFactory extends BaseTokenStreamTestCase { +/** Runs "ragazzo ragazzi" through a filter created by ItalianLightStemFilterFactory and expects both tokens to come out as "ragazz". */ public void testStemming() throws Exception { + Reader reader = new StringReader("ragazzo ragazzi"); + ItalianLightStemFilterFactory factory = new ItalianLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "ragazz", "ragazz" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/it/TestItalianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.lv; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Sanity test for LatvianStemFilterFactory: checks the tokens produced by a filter that the factory creates. + */ +public class TestLatvianStemFilterFactory extends BaseTokenStreamTestCase { +/** Runs "tirgiem tirgus" through the factory-created filter and expects both tokens to come out as "tirg". */ public void testStemming() throws Exception { + Reader reader = new StringReader("tirgiem tirgus"); + LatvianStemFilterFactory factory = new LatvianStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "tirg", "tirg" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/lv/TestLatvianStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt =================================================================== ---
lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +foo +bar \ No newline at end of file Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-1.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +junk +more \ No newline at end of file Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/keep-2.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java (working copy) @@ -0,0 +1,223 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Tests for {@link CapitalizationFilterFactory}: each case configures the factory, runs it over a MockTokenizer and checks the emitted terms. + */ +public class TestCapitalizationFilterFactory extends BaseTokenStreamTestCase { + + public void testCapitalization() throws Exception + { + Map args = new HashMap(); + args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" ); + args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); + + CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init( args ); + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.WHITESPACE, false)), + new String[] { "Kitten" }); + + factory.forceFirstLetter = true; + + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("and"), MockTokenizer.WHITESPACE, false)), + new String[] { "And" }); + + //first is forced, but it's not a keep word, either + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("AnD"), MockTokenizer.WHITESPACE, false)), + new String[] { "And" }); + + factory.forceFirstLetter = false; + + //first is not forced, but it's not a keep word, either + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("AnD"), MockTokenizer.WHITESPACE, false)), + new String[] { "And" }); + + factory.forceFirstLetter = true; + + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("big"), MockTokenizer.WHITESPACE, false)), + new String[] { "Big" }); + + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("BIG"), MockTokenizer.WHITESPACE, false)), + new String[] { "BIG" }); + + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("Hello 
thEre my Name is Ryan"), MockTokenizer.KEYWORD, false)), + new String[] { "Hello there my name is ryan" }); + + // now each token + factory.onlyFirstWord = false; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("Hello thEre my Name is Ryan"), MockTokenizer.WHITESPACE, false)), + new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); + + // now only the long words + factory.minWordLength = 3; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("Hello thEre my Name is Ryan"), MockTokenizer.WHITESPACE, false)), + new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); + + // without prefix + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("McKinley"), MockTokenizer.WHITESPACE, false)), + new String[] { "Mckinley" }); + + // Now try some prefixes + factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + args.put( "okPrefix", "McK" ); // all words + factory.init( args ); + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("McKinley"), MockTokenizer.WHITESPACE, false)), + new String[] { "McKinley" }); + + // now try some stuff with numbers + factory.forceFirstLetter = false; + factory.onlyFirstWord = false; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("1st 2nd third"), MockTokenizer.WHITESPACE, false)), + new String[] { "1st", "2nd", "Third" }); + + factory.forceFirstLetter = true; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("the The the"), MockTokenizer.KEYWORD, false)), + new String[] { "The The the" }); + } + + public void testKeepIgnoreCase() throws Exception { + Map args = new HashMap(); + args.put( CapitalizationFilterFactory.KEEP, "kitten" ); + args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" ); + args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); + + CapitalizationFilterFactory 
factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init( args ); + factory.forceFirstLetter = true; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.KEYWORD, false)), + new String[] { "KiTTEN" }); + + factory.forceFirstLetter = false; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.KEYWORD, false)), + new String[] { "kiTTEN" }); + + factory.keep = null; + assertTokenStreamContents(factory.create( + new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.KEYWORD, false)), + new String[] { "Kitten" }); + } + + /** + * Test CapitalizationFilterFactory's minWordLength option. + * + * The interaction with ONLY_FIRST_WORD is surprising: the short token {@code helo} is left as is, while {@code testing} is still capitalized. + */ + public void testMinWordLength() throws Exception { + Map args = new HashMap(); + args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true"); + args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5"); + CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer tokenizer = new MockTokenizer(new StringReader( + "helo testing"), MockTokenizer.WHITESPACE, false); + TokenStream ts = factory.create(tokenizer); + assertTokenStreamContents(ts, new String[] {"helo", "Testing"}); + } + + /** + * Test CapitalizationFilterFactory's maxWordCount option with only one word + * in each token (the limit is not exceeded, so every token is still capitalized) + */ + public void testMaxWordCount() throws Exception { + Map args = new HashMap(); + args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); + CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer tokenizer = new MockTokenizer(new StringReader( + "one two three four"), MockTokenizer.WHITESPACE, false); + TokenStream ts = 
factory.create(tokenizer); + assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"}); + } + + /** + * Test CapitalizationFilterFactory's maxWordCount option when exceeded (the token is expected to come through unchanged) + */ + public void testMaxWordCount2() throws Exception { + Map args = new HashMap(); + args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); + CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer tokenizer = new MockTokenizer(new StringReader( + "one two three four"), MockTokenizer.KEYWORD, false); + TokenStream ts = factory.create(tokenizer); + assertTokenStreamContents(ts, new String[] {"one two three four"}); + } + + /** + * Test CapitalizationFilterFactory's maxTokenLength option when exceeded + * + * This is weird, it is not really a max, but inclusive (look at 'is') + */ + public void testMaxTokenLength() throws Exception { + Map args = new HashMap(); + args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2"); + CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer tokenizer = new MockTokenizer(new StringReader( + "this is a test"), MockTokenizer.WHITESPACE, false); + TokenStream ts = factory.create(tokenizer); + assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"}); + } + + /** + * Test CapitalizationFilterFactory's forceFirstLetter option + */ + public void testForceFirstLetter() throws Exception { + Map args = new HashMap(); + args.put(CapitalizationFilterFactory.KEEP, "kitten"); + args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true"); + CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer tokenizer = new MockTokenizer(new StringReader("kitten"), MockTokenizer.WHITESPACE, false); + TokenStream ts = 
factory.create(tokenizer); + assertTokenStreamContents(ts, new String[] {"Kitten"}); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCapitalizationFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoader; + +import java.util.Map; +import java.util.HashMap; + +/** + * Checks that {@link KeepWordFilterFactory} loads its keep-word set from one or from several comma-separated resource files. + * + **/ +public class TestKeepFilterFactory extends BaseTokenStreamTestCase { + + public void testInform() throws Exception { + ResourceLoader loader = new ResourceAsStreamResourceLoader(getClass()); + assertTrue("loader is null and it shouldn't be", loader != null); + KeepWordFilterFactory factory = new KeepWordFilterFactory(); + Map args = new HashMap(); + args.put("words", "keep-1.txt"); + args.put("ignoreCase", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + CharArraySet words = factory.getWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); + + + factory = new KeepWordFilterFactory(); + args.put("words", "keep-1.txt, keep-2.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + words = factory.getWords(); + assertTrue("words is null and it shouldn't be", words != null); + assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); + + + + } +} \ No newline at end of file Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java 
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeepFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java (working copy) @@ -0,0 +1,68 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.StringMockResourceLoader; + +/** + * Simple tests to ensure the keyword marker filter factory is working: words listed as protected must pass through the downstream PorterStemFilter unstemmed. + */ +public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase { + public void testKeywords() throws IOException { + Reader reader = new StringReader("dogs cats"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory(); + Map args = new HashMap(); + ResourceLoader loader = new StringMockResourceLoader("cats"); + args.put("protected", "protwords.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "dog", "cats" }); + } + + public void testKeywordsCaseInsensitive() throws IOException { + Reader reader = new StringReader("dogs cats Cats"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory(); + Map args = new HashMap(); + ResourceLoader loader = new StringMockResourceLoader("cats"); + args.put("protected", "protwords.txt"); + args.put("ignoreCase", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "dog", 
"cats", "Cats" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestKeywordMarkerFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (working copy) @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** Checks that {@link LengthFilterFactory} keeps only tokens within the configured min/max length, with and without enablePositionIncrements. */ public class TestLengthFilterFactory extends BaseTokenStreamTestCase { + + public void test() throws IOException { + LengthFilterFactory factory = new LengthFilterFactory(); + Map args = new HashMap(); + args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); + args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + // default: args.put("enablePositionIncrements", "false"); + factory.init(args); + String test = "foo foobar super-duper-trooper"; + TokenStream stream = factory.create(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 }); + + factory = new LengthFilterFactory(); + args = new HashMap(); + args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); + args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); + args.put("enablePositionIncrements", "true"); + factory.init(args); + stream = factory.create(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 }); + } +} \ No newline at end of file Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestLengthFilterFactory.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java (working copy) @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import java.util.Iterator; +import java.util.Arrays; + +/** Simple tests to ensure this factory is working: a canned token sequence is fed through the filter it creates and the surviving terms are checked. */ +public class TestRemoveDuplicatesTokenFilterFactory extends BaseTokenStreamTestCase { + + /** Creates a Token with term text {@code t}, the given start/end offsets and position increment {@code pos}. */ public static Token tok(int pos, String t, int start, int end) { + Token tok = new Token(t,start,end); + tok.setPositionIncrement(pos); + return tok; + } + /** Creates a Token with term text {@code t}, position increment {@code pos} and both offsets set to 0. */ public static Token tok(int pos, String t) { + return tok(pos, t, 0,0); + } + + /** Runs {@code tokens} through the filter built by the factory and asserts that the output terms equal the whitespace-separated words of {@code expected}. */ public void testDups(final String expected, final Token... tokens) + throws Exception { + + final Iterator<Token> toks = Arrays.asList(tokens).iterator(); + RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory(); + final TokenStream ts = factory.create + (new TokenStream() { + CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + @Override + public boolean incrementToken() { + if (toks.hasNext()) { + clearAttributes(); + Token tok = toks.next(); + termAtt.setEmpty().append(tok); + offsetAtt.setOffset(tok.startOffset(), tok.endOffset()); + posIncAtt.setPositionIncrement(tok.getPositionIncrement()); + return true; + } else { + return false; + } + } + }); + + assertTokenStreamContents(ts, expected.split("\\s")); + } + + public void testSimpleDups() throws Exception { + testDups("A B C D E" + ,tok(1,"A", 0, 4) + ,tok(1,"B", 5, 10) + ,tok(0,"B",11, 15) + ,tok(1,"C",16, 20) + ,tok(0,"D",16, 20) + ,tok(1,"E",21, 25) + ); + } +} Index: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java (working copy) @@ -0,0 +1,69 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.StringMockResourceLoader; + +/** + * Simple tests to ensure the stemmer override filter factory is working: a dictionary entry must win over the downstream PorterStemFilter. + */ +public class TestStemmerOverrideFilterFactory extends BaseTokenStreamTestCase { + public void testKeywords() throws IOException { + // our stemdict stems dogs to 'cat' + Reader reader = new StringReader("testing dogs"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); + Map args = new HashMap(); + ResourceLoader loader = new StringMockResourceLoader("dogs\tcat"); + args.put("dictionary", "stemdict.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "test", "cat" }); + } + + public void testKeywordsCaseInsensitive() throws IOException { + Reader reader = new StringReader("testing DoGs"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); + Map args = new HashMap(); + ResourceLoader loader = new StringMockResourceLoader("dogs\tcat"); + args.put("dictionary", "stemdict.txt"); + args.put("ignoreCase", "true"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + 
factory.init(args); + factory.inform(loader); + + TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); + assertTokenStreamContents(ts, new String[] { "test", "cat" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure this factory is working: the single keyword token is expected to come out with its trailing whitespace removed + */ +public class TestTrimFilterFactory extends BaseTokenStreamTestCase { + public void testTrimming() throws Exception { + TrimFilterFactory factory = new TrimFilterFactory(); + Map args = new HashMap(); + args.put("updateOffsets", "false"); + factory.init(args); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("trim me "), MockTokenizer.KEYWORD, false)); + assertTokenStreamContents(ts, new String[] { "trim me" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestTrimFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java 
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java (working copy) @@ -0,0 +1,164 @@ +package org.apache.lucene.analysis.ngram; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the NGram filter factories are working. 
+ */ +public class TestNGramFilters extends BaseTokenStreamTestCase { + /** + * Test NGramTokenizerFactory + */ + public void testNGramTokenizer() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + NGramTokenizerFactory factory = new NGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "t", "e", "s", "t", "te", "es", "st" }); + } + /** + * Test NGramTokenizerFactory with min and max gram options + */ + public void testNGramTokenizer2() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("minGramSize", "2"); + args.put("maxGramSize", "3"); + NGramTokenizerFactory factory = new NGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "te", "es", "st", "tes", "est" }); + } + /** + * Test the NGramFilterFactory + */ + public void testNGramFilter() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + NGramFilterFactory factory = new NGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "t", "e", "s", "t", "te", "es", "st" }); + } + /** + * Test the NGramFilterFactory with min and max gram options + */ + public void testNGramFilter2() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("minGramSize", "2"); + args.put("maxGramSize", "3"); + NGramFilterFactory factory = new NGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "te", "es", "st", "tes", "est" }); + } + /** + * Test EdgeNGramTokenizerFactory + */ + public void 
testEdgeNGramTokenizer() throws Exception { + Reader reader = new StringReader("test"); + Map<String,String> args = new HashMap<String,String>(); + EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "t" }); + } + /** + * Test EdgeNGramTokenizerFactory with min and max gram size (minGramSize=1, maxGramSize=2) + */ + public void testEdgeNGramTokenizer2() throws Exception { + Reader reader = new StringReader("test"); + Map<String,String> args = new HashMap<String,String>(); + args.put("minGramSize", "1"); + args.put("maxGramSize", "2"); + EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "t", "te" }); + } + /** + * Test EdgeNGramTokenizerFactory with side option (side=back): expects the trailing "y" of "ready" + */ + public void testEdgeNGramTokenizer3() throws Exception { + Reader reader = new StringReader("ready"); + Map<String,String> args = new HashMap<String,String>(); + args.put("side", "back"); + EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "y" }); + } + /** + * Test EdgeNGramFilterFactory with no arguments: expects only the leading 1-gram of "test" + */ + public void testEdgeNGramFilter() throws Exception { + Reader reader = new StringReader("test"); + Map<String,String> args = new HashMap<String,String>(); + EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "t" }); + } + /** + * Test EdgeNGramFilterFactory with min and max gram size (minGramSize=1, maxGramSize=2) + */ + public void testEdgeNGramFilter2() throws Exception { + Reader reader = new StringReader("test"); + Map<String,String> args = new HashMap<String,String>(); + args.put("minGramSize", "1"); + args.put("maxGramSize", "2"); + EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); + factory.init(args); + 
TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "t", "te" }); + } + /** + * Test EdgeNGramFilterFactory with side option (side=back): expects the trailing "y" of "ready" + */ + public void testEdgeNGramFilter3() throws Exception { + Reader reader = new StringReader("ready"); + Map<String,String> args = new HashMap<String,String>(); + args.put("side", "back"); + EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "y" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/TestNGramFilters.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.no; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple test to ensure the Norwegian Light stem factory is working: "epler" and "eple" must both come out as "epl". + */ +public class TestNorwegianLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + NorwegianLightStemFilterFactory stemFactory = new NorwegianLightStemFilterFactory(); + Reader input = new StringReader("epler eple"); + TokenStream stemmed = stemFactory.create(new MockTokenizer(input, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stemmed, new String[] { "epl", "epl" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No 
newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.no; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Norwegian Minimal stem factory is working. 
+ */ +public class TestNorwegianMinimalStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + NorwegianMinimalStemFilterFactory stemFactory = new NorwegianMinimalStemFilterFactory(); + Reader input = new StringReader("eple eplet epler eplene eplets eplenes"); + TokenStream stemmed = stemFactory.create(new MockTokenizer(input, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stemmed, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/no/TestNorwegianMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java (working copy) @@ -0,0 +1,86 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.*; + +/** + * Simple tests to ensure {@link PatternReplaceCharFilterFactory} is working + */ +public class TestPatternReplaceCharFilterFactory extends BaseTokenStreamTestCase { + + //           1111 + // 01234567890123 + // this is test. + public void testNothingChange() throws IOException { + final String BLOCK = "this is test."; + PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); + args.put("replacement", "$1$2$3"); + factory.init(args); + CharFilter cs = factory.create( + new StringReader( BLOCK ) ); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "this", "is", "test." 
}, + new int[] { 0, 5, 8 }, + new int[] { 4, 7, 13 }); + } + + // 012345678 + // aa bb cc + public void testReplaceByEmpty() throws IOException { + final String BLOCK = "aa bb cc"; + PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); + factory.init(args); + CharFilter cs = factory.create( + new StringReader( BLOCK ) ); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + ts.reset(); + assertFalse(ts.incrementToken()); + ts.end(); + ts.close(); + } + + // 012345678 + // aa bb cc + // aa#bb#cc + public void test1block1matchSameLength() throws IOException { + final String BLOCK = "aa bb cc"; + PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); + args.put("replacement", "$1#$2#$3"); + factory.init(args); + CharFilter cs = factory.create( + new StringReader( BLOCK ) ); + TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); + assertTokenStreamContents(ts, + new String[] { "aa#bb#cc" }, + new int[] { 0 }, + new int[] { 8 }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of 
property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +/** + * Simple tests to ensure {@link PatternReplaceFilterFactory} is working + */ +public class TestPatternReplaceFilterFactory extends BaseTokenStreamTestCase { + + public void testReplaceAll() throws Exception { + String input = "aabfooaabfooabfoob ab caaaaaaaaab"; + PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory(); + Map<String,String> args = new HashMap<String,String>(); + args.put("pattern", "a*b"); + args.put("replacement", "-"); + factory.init(args); + TokenStream ts = factory.create + (new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false)); + + assertTokenStreamContents(ts, + new String[] { "-foo-foo-foo-", "-", "c-" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java (working copy) @@ 
-0,0 +1,41 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +/** Simple tests to ensure {@link PatternTokenizerFactory} is working */ +public class TestPatternTokenizerFactory extends BaseTokenStreamTestCase { + public void testFactory() throws Exception { + final String INPUT = "Günther Günther is here"; + + // create PatternTokenizer that splits on runs of commas, semicolons, slashes and whitespace + Map<String,String> args = new HashMap<String,String>(); + args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" ); + PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); + tokFactory.init( args ); + TokenStream stream = tokFactory.create( new StringReader(INPUT) ); + assertTokenStreamContents(stream, + new String[] { "Günther", "Günther", "is", "here" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java (revision 1365496) +++ 
lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java (working copy) @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.payloads; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; +import org.apache.lucene.analysis.payloads.FloatEncoder; +import org.apache.lucene.analysis.payloads.PayloadHelper; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.StringMockResourceLoader; + +public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenStreamTestCase { + + public void testEncoder() throws Exception { + Map<String,String> args = new HashMap<String,String>(); + args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, "float"); + DelimitedPayloadTokenFilterFactory factory = new DelimitedPayloadTokenFilterFactory(); + factory.init(args); + ResourceLoader loader = new StringMockResourceLoader("solr/collection1"); + factory.inform(loader); + + TokenStream input = new MockTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"), MockTokenizer.WHITESPACE, false); + DelimitedPayloadTokenFilter tf = factory.create(input); + tf.reset(); + while (tf.incrementToken()){ + PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class); + assertTrue("payAttr is null and it shouldn't be", payAttr != null); + assertTrue("payload is null and it shouldn't be", payAttr.getPayload() != null); // fail clearly instead of with an NPE on the next line + byte[] payData = payAttr.getPayload().bytes; + assertTrue("payData is null and it shouldn't be", payData != null); + float payFloat = PayloadHelper.decodeFloat(payData); + assertTrue(payFloat + " does not equal: " + 0.1f, payFloat == 0.1f); + } + } + + public void testDelim() throws Exception { + Map<String,String> args = new HashMap<String,String>(); + args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, FloatEncoder.class.getName()); + args.put(DelimitedPayloadTokenFilterFactory.DELIMITER_ATTR, "*"); + 
DelimitedPayloadTokenFilterFactory payloadFactory = new DelimitedPayloadTokenFilterFactory(); + payloadFactory.init(args); + ResourceLoader resources = new StringMockResourceLoader("solr/collection1"); + payloadFactory.inform(resources); + + TokenStream tokens = new MockTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"), MockTokenizer.WHITESPACE, false); + DelimitedPayloadTokenFilter payloadFilter = payloadFactory.create(tokens); + payloadFilter.reset(); + while (payloadFilter.incrementToken()){ + PayloadAttribute payloadAttr = payloadFilter.getAttribute(PayloadAttribute.class); + assertTrue("payAttr is null and it shouldn't be", payloadAttr != null); + byte[] payloadBytes = payloadAttr.getPayload().bytes; + assertTrue("payData is null and it shouldn't be", payloadBytes != null); + float decoded = PayloadHelper.decodeFloat(payloadBytes); + assertTrue(decoded + " does not equal: " + 0.1f, decoded == 0.1f); + } + } +} + Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/payloads/TestDelimitedPayloadTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 
@@ +package org.apache.lucene.analysis.pt; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Portuguese Light stem factory is working. 
+ */ +public class TestPortugueseLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("evidentemente"); + PortugueseLightStemFilterFactory factory = new PortugueseLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "evident" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.pt; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Portuguese Minimal stem factory is working. + */ +public class TestPortugueseMinimalStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("questões"); + PortugueseMinimalStemFilterFactory factory = new PortugueseMinimalStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "questão" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseMinimalStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 
## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.pt; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Portuguese stem factory is working. 
+ */ +public class TestPortugueseStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("maluquice"); + PortugueseStemFilterFactory factory = new PortugueseStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "maluc" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/pt/TestPortugueseStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java (working copy) @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.reverse; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Reverse string filter factory is working. + */ +public class TestReverseStringFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually reverses text. + */ + public void testReversing() throws Exception { + Reader reader = new StringReader("simple test"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + ReverseStringFilterFactory factory = new ReverseStringFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "elpmis", "tset" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java (working copy) Property changes on: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/reverse/TestReverseStringFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.ru; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Russian light stem factory is working. 
+ */ +public class TestRussianLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("журналы"); + RussianLightStemFilterFactory factory = new RussianLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "журнал" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/ru/TestRussianLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java (working copy) @@ -0,0 +1,239 @@ +package org.apache.lucene.analysis.shingle; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Shingle filter factory works. + */ +public class TestShingleFilterFactory extends BaseTokenStreamTestCase { + /** + * Test the defaults + */ + public void testDefaults() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] {"this", "this is", "is", + "is a", "a", "a test", "test"}); + } + + /** + * Test with unigrams disabled + */ + public void testNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] {"this is", "is a", "a test"}); + } + + /** + * Test with a higher max shingle size + */ + public void testMaxShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("maxShingleSize", 
"3"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] {"this", "this is", "this is a", "is", + "is a", "is a test", "a", "a test", "test"}); + } + + /** + * Test with higher min (and max) shingle size + */ + public void testMinShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this", "this is a", "this is a test", + "is", "is a test", "a", "test" }); + } + + /** + * Test with higher min (and max) shingle size and with unigrams disabled + */ + public void testMinShingleSizeNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this is a", "this is a test", "is a test" }); + } + + /** + * Test with higher same min and max shingle size + */ + public void testEqualMinAndMaxShingleSize() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "3"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, 
false)); + assertTokenStreamContents(stream, + new String[] { "this", "this is a", "is", "is a test", "a", "test" }); + } + + /** + * Test with higher same min and max shingle size and with unigrams disabled + */ + public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "3"); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this is a", "is a test" }); + } + + /** + * Test with a non-default token separator + */ + public void testTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", "=BLAH="); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a", + "a", "a=BLAH=test", "test" }); + } + + /** + * Test with a non-default token separator and with unigrams disabled + */ + public void testTokenSeparatorNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", "=BLAH="); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" }); + } + + /** + * Test with an empty token separator + */ + public void testEmptyTokenSeparator() throws Exception 
{ + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("tokenSeparator", ""); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" }); + } + + /** + * Test with higher min (and max) shingle size + * and with a non-default token separator + */ + public void testMinShingleSizeAndTokenSeparator() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("tokenSeparator", "=BLAH="); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this", "this=BLAH=is=BLAH=a", + "this=BLAH=is=BLAH=a=BLAH=test", "is", + "is=BLAH=a=BLAH=test", "a", "test" }); + } + + /** + * Test with higher min (and max) shingle size + * and with a non-default token separator + * and with unigrams disabled + */ + public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception { + Reader reader = new StringReader("this is a test"); + Map args = new HashMap(); + args.put("minShingleSize", "3"); + args.put("maxShingleSize", "4"); + args.put("tokenSeparator", "=BLAH="); + args.put("outputUnigrams", "false"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, + new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", + "is=BLAH=a=BLAH=test", }); + } + + /** + * Test with unigrams disabled except when there are no shingles, with + * a single 
input token. Using default min/max shingle sizes: 2/2. No + * shingles will be created, since there are fewer input tokens than + * min shingle size. However, because outputUnigramsIfNoShingles is + * set to true, even though outputUnigrams is set to false, one + * unigram should be output. + */ + public void testOutputUnigramsIfNoShingles() throws Exception { + Reader reader = new StringReader("test"); + Map args = new HashMap(); + args.put("outputUnigrams", "false"); + args.put("outputUnigramsIfNoShingles", "true"); + ShingleFilterFactory factory = new ShingleFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "test" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java (working copy) @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.snowball; + +/** + * Copyright 2004 The Apache Software Foundation + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.StringMockResourceLoader; +import org.tartarus.snowball.ext.EnglishStemmer; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class TestSnowballPorterFilterFactory extends BaseTokenStreamTestCase { + + public void test() throws IOException { + EnglishStemmer stemmer = new EnglishStemmer(); + String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"}; + String[] gold = new String[test.length]; + for (int i = 0; i < test.length; i++) { + stemmer.setCurrent(test[i]); + stemmer.stem(); + gold[i] = stemmer.getCurrent(); + } + + SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); + Map args = new HashMap(); + args.put("language", "English"); + + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(new StringMockResourceLoader("")); + Tokenizer tokenizer = new MockTokenizer( + new StringReader(join(test, ' ')), MockTokenizer.WHITESPACE, false); + TokenStream stream = 
factory.create(tokenizer); + assertTokenStreamContents(stream, gold); + } + + String join(String[] stuff, char sep) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < stuff.length; i++) { + if (i > 0) { + sb.append(sep); + } + sb.append(stuff[i]); + } + return sb.toString(); + } + + class LinesMockSolrResourceLoader implements ResourceLoader { + List lines; + + LinesMockSolrResourceLoader(List lines) { + this.lines = lines; + } + + public List getLines(String resource) throws IOException { + return lines; + } + + public T newInstance(String cname, Class expectedType, String... subpackages) { + return null; + } + + public InputStream openResource(String resource) throws IOException { + return null; + } + } + + /** + * Test the protected words mechanism of SnowballPorterFilterFactory + */ + public void testProtected() throws Exception { + SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); + ResourceLoader loader = new StringMockResourceLoader("ridding"); + Map args = new HashMap(); + args.put("protected", "protwords.txt"); + args.put("language", "English"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(loader); + Reader reader = new StringReader("ridding of some stemming"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" }); + } +} + Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java (working copy) Property changes on: 
lucene/analysis/common/src/test/org/apache/lucene/analysis/snowball/TestSnowballPorterFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java (working copy) @@ -0,0 +1,191 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; +import org.apache.lucene.analysis.core.LetterTokenizerFactory; +import org.apache.lucene.analysis.core.LowerCaseTokenizerFactory; +import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory; + +/** + * Simple tests to ensure the standard lucene factories are working. + */ +public class TestStandardFactories extends BaseTokenStreamTestCase { + /** + * Test StandardTokenizerFactory + */ + public void testStandardTokenizer() throws Exception { + Reader reader = new StringReader("Wha\u0301t's this thing do?"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"Wha\u0301t's", "this", "thing", "do" }); + } + + public void testStandardTokenizerMaxTokenLength() throws Exception { + StringBuilder builder = new StringBuilder(); + for (int i = 0 ; i < 100 ; ++i) { + builder.append("abcdefg"); // 7 * 100 = 700 char "word" + } + String longWord = builder.toString(); + String content = "one two three " + longWord + " four five six"; + Reader reader = new StringReader(content); + Map args = new HashMap(); + args.put("maxTokenLength", "1000"); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + 
new String[] {"one", "two", "three", longWord, "four", "five", "six" }); + } + + /** + * Test ClassicTokenizerFactory + */ + public void testClassicTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What's", "this", "thing", "do" }); + } + + public void testClassicTokenizerMaxTokenLength() throws Exception { + StringBuilder builder = new StringBuilder(); + for (int i = 0 ; i < 100 ; ++i) { + builder.append("abcdefg"); // 7 * 100 = 700 char "word" + } + String longWord = builder.toString(); + String content = "one two three " + longWord + " four five six"; + Reader reader = new StringReader(content); + Map args = new HashMap(); + args.put("maxTokenLength", "1000"); + ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"one", "two", "three", longWord, "four", "five", "six" }); + } + + /** + * Test ClassicFilterFactory + */ + public void testStandardFilter() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + ClassicFilterFactory filterFactory = new ClassicFilterFactory(); + filterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + filterFactory.init(args); + Tokenizer tokenizer = factory.create(reader); + TokenStream stream = filterFactory.create(tokenizer); + assertTokenStreamContents(stream, + new String[] {"What", "this", "thing", "do"}); + } + + /** + * 
Test KeywordTokenizerFactory + */ + public void testKeywordTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + KeywordTokenizerFactory factory = new KeywordTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What's this thing do?"}); + } + + /** + * Test WhitespaceTokenizerFactory + */ + public void testWhitespaceTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What's", "this", "thing", "do?"}); + } + + /** + * Test LetterTokenizerFactory + */ + public void testLetterTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + LetterTokenizerFactory factory = new LetterTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"What", "s", "this", "thing", "do"}); + } + + /** + * Test LowerCaseTokenizerFactory + */ + public void testLowerCaseTokenizer() throws Exception { + Reader reader = new StringReader("What's this thing do?"); + LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"what", "s", "this", "thing", "do"}); + } + + /** + * Ensure the ASCIIFoldingFilterFactory works + */ + public void 
testASCIIFolding() throws Exception { + Reader reader = new StringReader("Česká"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Ceska" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestStandardFactories.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java (working copy) @@ -0,0 +1,192 @@ +package org.apache.lucene.analysis.standard; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; + +/** + * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer + */ + +public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamTestCase { + + public void testUAX29URLEmailTokenizer() throws Exception { + Reader reader = new StringReader("Wha\u0301t's this thing do?"); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"Wha\u0301t's", "this", "thing", "do" }); + } + + public void testArabic() throws Exception { + Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008."); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new 
String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", + "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" }); + } + + public void testChinese() throws Exception { + Reader reader = new StringReader("我是中国人。 1234 Tests "); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"我", "是", "中", "国", "人", "1234", "Tests"}); + } + + public void testKorean() throws Exception { + Reader reader = new StringReader("안녕하세요 한글입니다"); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"안녕하세요", "한글입니다"}); + } + + public void testHyphen() throws Exception { + Reader reader = new StringReader("some-dashed-phrase"); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"some", "dashed", "phrase"}); + } + + // Test with some URLs from TestUAX29URLEmailTokenizer's + // urls.from.random.text.with.urls.txt + public void testURLs() throws Exception { + String textWithURLs + = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on\n" + + " some extra\nWords thrown in here. 
" + + "http://c5-3486.bisynxu.FR/aI.YnNms/" + + " samba Halta gamba " + + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n" + + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n" + + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m" + + " inter Locutio " + + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n" + + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7" + + " blah Sirrah woof " + + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n"; + Reader reader = new StringReader(textWithURLs); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { + "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on", + "some", "extra", "Words", "thrown", "in", "here", + "http://c5-3486.bisynxu.FR/aI.YnNms/", + "samba", "Halta", "gamba", + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R", + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb", + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m", + "inter", "Locutio", + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/", + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7", + "blah", "Sirrah", "woof", + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4" + } + ); + } + + // Test with some emails from TestUAX29URLEmailTokenizer's + // email.addresses.from.random.text.with.email.addresses.txt + public void testEmails() throws Exception { + String textWithEmails + = " some extra\nWords thrown in here. 
" + + "dJ8ngFi@avz13m.CC\n" + + "kU-l6DS@[082.015.228.189]\n" + + "\"%U\u0012@?\\B\"@Fl2d.md" + + " samba Halta gamba " + + "Bvd#@tupjv.sn\n" + + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n" + + "~+Kdz@3mousnl.SE\n" + + " inter Locutio " + + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n" + + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM" + + " blah Sirrah woof " + + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n" + + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n"; + Reader reader = new StringReader(textWithEmails); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { + "some", "extra", "Words", "thrown", "in", "here", + "dJ8ngFi@avz13m.CC", + "kU-l6DS@[082.015.228.189]", + "\"%U\u0012@?\\B\"@Fl2d.md", + "samba", "Halta", "gamba", + "Bvd#@tupjv.sn", + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt", + "~+Kdz@3mousnl.SE", + "inter", "Locutio", + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY", + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM", + "blah", "Sirrah", "woof", + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae", + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H" + } + ); + } + + public void testMaxTokenLength() throws Exception { + StringBuilder builder = new StringBuilder(); + for (int i = 0 ; i < 100 ; ++i) { + builder.append("abcdefg"); // 7 * 100 = 700 char "word" + } + String longWord = builder.toString(); + String content = "one two three " + longWord + " four five six"; + Reader reader = new StringReader(content); + Map args = new HashMap(); + args.put("maxTokenLength", "1000"); + UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + Tokenizer stream = 
factory.create(reader); + assertTokenStreamContents(stream, + new String[] {"one", "two", "three", longWord, "four", "five", "six" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/standard/TestUAX29URLEmailTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package org.apache.lucene.analysis.sv; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests to ensure the Swedish Light stem factory is working. + */ +public class TestSwedishLightStemFilterFactory extends BaseTokenStreamTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("äpplen äpple"); + SwedishLightStemFilterFactory factory = new SwedishLightStemFilterFactory(); + TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(stream, new String[] { "äppl", "äppl" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/sv/TestSwedishLightStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt 
=================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt (working copy) @@ -0,0 +1,31 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaa => aaaa +bbb => bbbb1 bbbb2 +ccc => cccc1,cccc2 +a\=>a => b\=>b +a\,a => b\,b +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. 
+ +# Synonym mappings can be used for spelling correction too +pixima => pixma + Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java (working copy) @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.synonym; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.synonym.SynonymFilter; +import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader; +import org.apache.lucene.analysis.util.StringMockResourceLoader; + +public class TestSynonymFilterFactory extends BaseTokenStreamTestCase { + /** test that we can parse and use the solr syn file */ + public void testSynonyms() throws Exception { + SynonymFilterFactory factory = new SynonymFilterFactory(); + Map args = new HashMap(); + args.put("synonyms", "synonyms.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(new ResourceAsStreamResourceLoader(getClass())); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false)); + assertTrue(ts instanceof SynonymFilter); + assertTokenStreamContents(ts, + new String[] { "GB", "gib", "gigabyte", "gigabytes" }, + new int[] { 1, 0, 0, 0 }); + } + + /** if the synonyms are completely empty, test that we still analyze correctly */ + public void testEmptySynonyms() throws Exception { + SynonymFilterFactory factory = new SynonymFilterFactory(); + Map args = new HashMap(); + args.put("synonyms", "synonyms.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(new StringMockResourceLoader("")); // empty file! 
+ TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(ts, new String[] { "GB" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java (working copy) @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.th; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.th.ThaiWordFilter; + +/** + * Simple tests to ensure the Thai word filter factory is working. + */ +public class TestThaiWordFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually decomposes text. + */ + public void testWordBreak() throws Exception { + assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE); + Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + ThaiWordFilterFactory factory = new ThaiWordFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้", + "ต้อง", "แสดง", "ว่า", "งาน", "ดี"}); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/th/TestThaiWordFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property 
Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java (working copy) @@ -0,0 +1,42 @@ +package org.apache.lucene.analysis.tr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * Simple tests to ensure the Turkish lowercase filter factory is working. + */ +public class TestTurkishLowerCaseFilterFactory extends BaseTokenStreamTestCase { + /** + * Ensure the filter actually lowercases text. 
+ */ + public void testCasing() throws Exception { + Reader reader = new StringReader("AĞACI"); + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + TurkishLowerCaseFilterFactory factory = new TurkishLowerCaseFilterFactory(); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "ağacı" }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/tr/TestTurkishLowerCaseFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java (working copy) @@ -0,0 +1,84 @@ +package org.apache.lucene.analysis.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CodingErrorAction; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.util.IOUtils; + +public class ResourceAsStreamResourceLoader implements ResourceLoader { + Class clazz; + + public ResourceAsStreamResourceLoader(Class clazz) { + this.clazz = clazz; + } + + @Override + public InputStream openResource(String resource) throws IOException { + return clazz.getResourceAsStream(resource); + } + + @Override + public List getLines(String resource) throws IOException { + BufferedReader input = null; + ArrayList lines; + try { + input = new BufferedReader(new InputStreamReader(openResource(resource), + IOUtils.CHARSET_UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT))); + + lines = new ArrayList(); + for (String word=null; (word=input.readLine())!=null;) { + // skip initial bom marker + if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF') + word = word.substring(1); + // skip comments + if (word.startsWith("#")) continue; + word=word.trim(); + // skip blank lines + if (word.length()==0) continue; + lines.add(word); + } + } catch (CharacterCodingException ex) { + throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex); + } finally { + if (input != null) + input.close(); + } + return lines; + } + + // TODO: do 
this subpackages thing... wtf is that? + @Override + public T newInstance(String cname, Class expectedType, String... subpackages) { + try { + Class clazz = Class.forName(cname).asSubclass(expectedType); + return clazz.newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/ResourceAsStreamResourceLoader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java (working copy) @@ -0,0 +1,51 @@ +package org.apache.lucene.analysis.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.List; + +/** Fake resource loader for tests: works if you want to fake reading a single file */ +public class StringMockResourceLoader implements ResourceLoader { + String text; + + public StringMockResourceLoader(String text) { + this.text = text; + } + + public List getLines(String resource) throws IOException { + return Arrays.asList(text.split("\n")); + } + + // TODO: do this subpackages thing... wtf is that? + public T newInstance(String cname, Class expectedType, String... 
subpackages) { + try { + Class clazz = Class.forName(cname).asSubclass(expectedType); + return clazz.newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public InputStream openResource(String resource) throws IOException { + return new ByteArrayInputStream(text.getBytes("UTF-8")); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/StringMockResourceLoader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java (working copy) @@ -0,0 +1,180 @@ +package org.apache.lucene.analysis.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory; +import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory; +import org.apache.lucene.util.LuceneTestCase; + +public class TestAnalysisSPILoader extends LuceneTestCase { + + public void testLookupTokenizer() { + assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName("Whitespace").getClass()); + assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName("WHITESPACE").getClass()); + assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName("whitespace").getClass()); + } + + public void testBogusLookupTokenizer() { + try { + TokenizerFactory.forName("sdfsdfsdfdsfsdfsdf"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + + try { + TokenizerFactory.forName("!(**#$U*#$*"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + } + + public void testLookupTokenizerClass() { + assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass("Whitespace")); + assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass("WHITESPACE")); + assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass("whitespace")); + } + + public void testBogusLookupTokenizerClass() { + try { + TokenizerFactory.lookupClass("sdfsdfsdfdsfsdfsdf"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + + try { + 
TokenizerFactory.lookupClass("!(**#$U*#$*"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + } + + public void testAvailableTokenizers() { + assertTrue(TokenizerFactory.availableTokenizers().contains("whitespace")); + } + + public void testLookupTokenFilter() { + assertSame(LowerCaseFilterFactory.class, TokenFilterFactory.forName("Lowercase").getClass()); + assertSame(LowerCaseFilterFactory.class, TokenFilterFactory.forName("LOWERCASE").getClass()); + assertSame(LowerCaseFilterFactory.class, TokenFilterFactory.forName("lowercase").getClass()); + + assertSame(RemoveDuplicatesTokenFilterFactory.class, TokenFilterFactory.forName("RemoveDuplicates").getClass()); + assertSame(RemoveDuplicatesTokenFilterFactory.class, TokenFilterFactory.forName("REMOVEDUPLICATES").getClass()); + assertSame(RemoveDuplicatesTokenFilterFactory.class, TokenFilterFactory.forName("removeduplicates").getClass()); + } + + public void testBogusLookupTokenFilter() { + try { + TokenFilterFactory.forName("sdfsdfsdfdsfsdfsdf"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + + try { + TokenFilterFactory.forName("!(**#$U*#$*"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + } + + public void testLookupTokenFilterClass() { + assertSame(LowerCaseFilterFactory.class, TokenFilterFactory.lookupClass("Lowercase")); + assertSame(LowerCaseFilterFactory.class, TokenFilterFactory.lookupClass("LOWERCASE")); + assertSame(LowerCaseFilterFactory.class, TokenFilterFactory.lookupClass("lowercase")); + + assertSame(RemoveDuplicatesTokenFilterFactory.class, TokenFilterFactory.lookupClass("RemoveDuplicates")); + assertSame(RemoveDuplicatesTokenFilterFactory.class, TokenFilterFactory.lookupClass("REMOVEDUPLICATES")); + assertSame(RemoveDuplicatesTokenFilterFactory.class, TokenFilterFactory.lookupClass("removeduplicates")); + } + + public void testBogusLookupTokenFilterClass() { + try { + TokenFilterFactory.lookupClass("sdfsdfsdfdsfsdfsdf"); + fail(); + } 
catch (IllegalArgumentException expected) { + // + } + + try { + TokenFilterFactory.lookupClass("!(**#$U*#$*"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + } + + public void testAvailableTokenFilters() { + assertTrue(TokenFilterFactory.availableTokenFilters().contains("lowercase")); + assertTrue(TokenFilterFactory.availableTokenFilters().contains("removeduplicates")); + } + + public void testLookupCharFilter() { + assertSame(HTMLStripCharFilterFactory.class, CharFilterFactory.forName("HTMLStrip").getClass()); + assertSame(HTMLStripCharFilterFactory.class, CharFilterFactory.forName("HTMLSTRIP").getClass()); + assertSame(HTMLStripCharFilterFactory.class, CharFilterFactory.forName("htmlstrip").getClass()); + } + + public void testBogusLookupCharFilter() { + try { + CharFilterFactory.forName("sdfsdfsdfdsfsdfsdf"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + + try { + CharFilterFactory.forName("!(**#$U*#$*"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + } + + public void testLookupCharFilterClass() { + assertSame(HTMLStripCharFilterFactory.class, CharFilterFactory.lookupClass("HTMLStrip")); + assertSame(HTMLStripCharFilterFactory.class, CharFilterFactory.lookupClass("HTMLSTRIP")); + assertSame(HTMLStripCharFilterFactory.class, CharFilterFactory.lookupClass("htmlstrip")); + } + + public void testBogusLookupCharFilterClass() { + try { + CharFilterFactory.lookupClass("sdfsdfsdfdsfsdfsdf"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + + try { + CharFilterFactory.lookupClass("!(**#$U*#$*"); + fail(); + } catch (IllegalArgumentException expected) { + // + } + } + + public void testAvailableCharFilters() { + assertTrue(CharFilterFactory.availableCharFilters().contains("htmlstrip")); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java =================================================================== --- 
lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestAnalysisSPILoader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java (revision 0) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java (working copy) @@ -0,0 +1,43 @@ +package org.apache.lucene.analysis.wikipedia; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; + +/** + * Simple tests to ensure the wikipedia tokenizer is working. + */ +public class TestWikipediaTokenizerFactory extends BaseTokenStreamTestCase { + public void testTokenizer() throws IOException { + Reader reader = new StringReader("This is a [[Category:foo]]"); + WikipediaTokenizerFactory factory = new WikipediaTokenizerFactory(); + Tokenizer tokenizer = factory.create(reader); + assertTokenStreamContents(tokenizer, + new String[] { "This", "is", "a", "foo" }, + new int[] { 0, 5, 8, 21 }, + new int[] { 4, 7, 9, 24 }, + new String[] { "", "", "", WikipediaTokenizer.CATEGORY }, + new int[] { 1, 1, 1, 1, }); + } +} Index: lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java =================================================================== --- lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java (revision 1365496) +++ lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java (working copy) Property changes on: lucene/analysis/common/src/test/org/apache/lucene/analysis/wikipedia/TestWikipediaTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java (revision 0) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java (working copy) @@ -0,0 +1,37 @@ +package 
org.apache.lucene.analysis.icu; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUFoldingFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** Factory for {@link ICUFoldingFilter} */ +public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + + @Override + public TokenStream create(TokenStream input) { + return new ICUFoldingFilter(input); + } + + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java (revision 1365496) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java (working copy) Property changes on: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUFoldingFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java (revision 0) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java (working copy) @@ -0,0 +1,87 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import com.ibm.icu.text.FilteredNormalizer2; +import com.ibm.icu.text.Normalizer2; +import com.ibm.icu.text.UnicodeSet; + +/** + * Factory for {@link ICUNormalizer2Filter} + *

+ * Supports the following attributes: + *

    + *
  • name: A Unicode Normalization Form, + * one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf. + *
  • mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc + * or nfkc, to get nfd or nfkd, respectively. + *
  • filter: A {@link UnicodeSet} pattern. Codepoints outside the set are + * always left unchanged. Default is [] (the null set, no filtering). + *
+ * @see ICUNormalizer2Filter + * @see Normalizer2 + * @see FilteredNormalizer2 + */ +public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + private Normalizer2 normalizer; + + // TODO: support custom normalization + @Override + public void init(Map args) { + super.init(args); + String name = args.get("name"); + if (name == null) + name = "nfkc_cf"; + String mode = args.get("mode"); + if (mode == null) + mode = "compose"; + + if (mode.equals("compose")) + normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE); + else if (mode.equals("decompose")) + normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE); + else + throw new InitializationException("Invalid mode: " + mode); + + String filter = args.get("filter"); + if (filter != null) { + UnicodeSet set = new UnicodeSet(filter); + if (!set.isEmpty()) { + set.freeze(); + normalizer = new FilteredNormalizer2(normalizer, set); + } + } + } + + public TokenStream create(TokenStream input) { + return new ICUNormalizer2Filter(input, normalizer); + } + + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java (revision 1365496) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java (working copy) Property changes on: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2FilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java 
=================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java (revision 0) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java (working copy) @@ -0,0 +1,73 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.icu.ICUTransformFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import com.ibm.icu.text.Transliterator; + +/** + * Factory for {@link ICUTransformFilter}. + *

+ * Supports the following attributes: + *

    + *
  • id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()} + *
  • direction (optional): Either 'forward' or 'reverse'. Default is forward. + *
+ * @see Transliterator + */ +public class ICUTransformFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + private Transliterator transliterator; + + // TODO: add support for custom rules + @Override + public void init(Map args) { + super.init(args); + String id = args.get("id"); + if (id == null) { + throw new InitializationException("id is required."); + } + + int dir; + String direction = args.get("direction"); + if (direction == null || direction.equalsIgnoreCase("forward")) + dir = Transliterator.FORWARD; + else if (direction.equalsIgnoreCase("reverse")) + dir = Transliterator.REVERSE; + else + throw new InitializationException("invalid direction: " + direction); + + transliterator = Transliterator.getInstance(id, dir); + } + + public TokenStream create(TokenStream input) { + return new ICUTransformFilter(input, transliterator); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java (revision 1365496) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java (working copy) Property changes on: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUTransformFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (revision 0) +++ 
lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (working copy) @@ -0,0 +1,33 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; +import org.apache.lucene.analysis.util.TokenizerFactory; + +/** Factory for {@link ICUTokenizer} */ +public class ICUTokenizerFactory extends TokenizerFactory { + // TODO: add support for custom configs + @Override + public Tokenizer create(Reader input) { + return new ICUTokenizer(input); + } +} Index: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java =================================================================== --- lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (revision 1365496) +++ lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java (working copy) Property changes on: lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.icu.ICUFoldingFilterFactory +org.apache.lucene.analysis.icu.ICUNormalizer2FilterFactory +org.apache.lucene.analysis.icu.ICUTransformFilterFactory Index: lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory =================================================================== --- lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (revision 0) +++ lucene/analysis/icu/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java (revision 0) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java (working copy) @@ -0,0 +1,36 @@ +package org.apache.lucene.analysis.icu.segmentation; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +/** basic tests for {@link ICUTokenizerFactory} **/ +public class TestICUTokenizerFactory extends BaseTokenStreamTestCase { + public void testMixedText() throws Exception { + Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ"); + ICUTokenizerFactory factory = new ICUTokenizerFactory(); + TokenStream stream = factory.create(reader); + assertTokenStreamContents(stream, + new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", + "This", "is", "a", "test", "ກວ່າ", "ດອກ"}); + } +} Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java (revision 1365496) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java (working copy) Property changes on: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java (revision 0) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java (working copy) @@ -0,0 +1,40 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUFoldingFilterFactory} */ +public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase { + + /** basic tests to ensure the folding is working */ + public void test() throws Exception { + Reader reader = new StringReader("Résumé"); + ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "resume" }); + } +} Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java (revision 1365496) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java (working copy) Property changes on: 
lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java (revision 0) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java (working copy) @@ -0,0 +1,46 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUNormalizer2FilterFactory} */ +public class TestICUNormalizer2FilterFactory extends BaseTokenStreamTestCase { + + /** Test nfkc_cf defaults */ + public void testDefaults() throws Exception { + Reader reader = new StringReader("This is a Test"); + ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" }); + } + + // TODO: add tests for different forms +} Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java (revision 1365496) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java (working copy) Property changes on: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2FilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java 
(revision 0) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java (working copy) @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis.icu; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** basic tests for {@link ICUTransformFilterFactory} */ +public class TestICUTransformFilterFactory extends BaseTokenStreamTestCase { + + /** ensure the transform is working */ + public void test() throws Exception { + Reader reader = new StringReader("簡化字"); + ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); + Map args = new HashMap(); + args.put("id", "Traditional-Simplified"); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "简化字" }); + } + + /** test forward and reverse direction */ + public 
void testDirection() throws Exception { + // forward + Reader reader = new StringReader("Российская Федерация"); + ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); + Map args = new HashMap(); + args.put("id", "Cyrillic-Latin"); + factory.init(args); + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" }); + + // backward (invokes Latin-Cyrillic) + reader = new StringReader("Rossijskaâ Federaciâ"); + args.put("direction", "reverse"); + factory.init(args); + tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); + stream = factory.create(tokenizer); + assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" }); + } +} Index: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java =================================================================== --- lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java (revision 1365496) +++ lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java (working copy) Property changes on: lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUTransformFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java (working copy) @@ -0,0 +1,41 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link org.apache.lucene.analysis.ja.JapaneseBaseFormFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class JapaneseBaseFormFilterFactory extends TokenFilterFactory { + + @Override + public TokenStream create(TokenStream input) { + return new JapaneseBaseFormFilter(input); + } +} Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseBaseFormFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java (working copy) @@ -0,0 +1,65 @@ +package org.apache.lucene.analysis.ja; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; + +import java.io.Reader; +import java.util.Map; + +/** + * Factory for {@link org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
+ *   <analyzer>
+ *     <charFilter class="solr.JapaneseIterationMarkCharFilterFactory" normalizeKanji="true" normalizeKana="true"/>
+ *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ */ +public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent { + + private static final String NORMALIZE_KANJI_PARAM = "normalizeKanji"; + + private static final String NORMALIZE_KANA_PARAM = "normalizeKana"; + + private boolean normalizeKanji = true; + + private boolean normalizeKana = true; + + @Override + public CharFilter create(Reader input) { + return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana); + } + + @Override + public void init(Map args) { + super.init(args); + normalizeKanji = getBoolean(NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT); + normalizeKana = getBoolean(NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseIterationMarkCharFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java 
(working copy) @@ -0,0 +1,55 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link JapaneseKatakanaStemFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ *     <filter class="solr.JapaneseKatakanaStemFilterFactory"
+ *             minimumLength="4"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class JapaneseKatakanaStemFilterFactory extends TokenFilterFactory { + private static final String MINIMUM_LENGTH_PARAM = "minimumLength"; + private int minimumLength; + + @Override + public void init(Map args) { + super.init(args); + minimumLength = getInt(MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH); + if (minimumLength < 2) { + throw new InitializationException("Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)"); + } + } + + public TokenStream create(TokenStream input) { + return new JapaneseKatakanaStemFilter(input, minimumLength); + } +} Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (working copy) @@ -0,0 +1,67 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter; +import org.apache.lucene.analysis.util.*; + +/** + * Factory for {@link org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ *     <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
+ *             tags="stopTags.txt" 
+ *             enablePositionIncrements="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private boolean enablePositionIncrements; + private Set stopTags; + + public void inform(ResourceLoader loader) { + String stopTagFiles = args.get("tags"); + enablePositionIncrements = getBoolean("enablePositionIncrements", false); + stopTags = null; + try { + CharArraySet cas = getWordSet(loader, stopTagFiles, false); + if (cas != null) { + stopTags = new HashSet(); + for (Object element : cas) { + char chars[] = (char[]) element; + stopTags.add(new String(chars)); + } + } + } catch (IOException e) { + throw new InitializationException("IOException thrown while loading tags", e); + } + } + + public TokenStream create(TokenStream stream) { + // if stoptags is null, it means the file is empty + return stopTags == null ? stream : new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags); + } +} Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapanesePartOfSpeechStopFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java =================================================================== --- 
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java (working copy) @@ -0,0 +1,51 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +import java.util.Map; + +/** + * Factory for {@link org.apache.lucene.analysis.ja.JapaneseReadingFormFilter}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
+ *     <filter class="solr.JapaneseReadingFormFilterFactory"
+ *             useRomaji="false"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class JapaneseReadingFormFilterFactory extends TokenFilterFactory { + private static final String ROMAJI_PARAM = "useRomaji"; + private boolean useRomaji; + + @Override + public void init(Map args) { + super.init(args); + useRomaji = getBoolean(ROMAJI_PARAM, false); + } + + public TokenStream create(TokenStream input) { + return new JapaneseReadingFormFilter(input, useRomaji); + } +} Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseReadingFormFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java (working copy) @@ -0,0 +1,108 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.Locale; +import java.util.Map; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ja.JapaneseTokenizer; +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; +import org.apache.lucene.analysis.ja.dict.UserDictionary; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; + +/** + * Factory for {@link org.apache.lucene.analysis.ja.JapaneseTokenizer}. + *
+ * <fieldType name="text_ja" class="solr.TextField">
+ *   <analyzer>
+ *     <tokenizer class="solr.JapaneseTokenizerFactory"
+ *       mode="NORMAL"
+ *       userDictionary="user.txt"
+ *       userDictionaryEncoding="UTF-8"
+ *       discardPunctuation="true"
+ *     />
+ *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * 
+ */ +public class JapaneseTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware { + private static final String MODE = "mode"; + + private static final String USER_DICT_PATH = "userDictionary"; + + private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; + + private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option + + private UserDictionary userDictionary; + + private Mode mode; + + private boolean discardPunctuation; + + @Override + public void inform(ResourceLoader loader) { + mode = getMode(args); + String userDictionaryPath = args.get(USER_DICT_PATH); + try { + if (userDictionaryPath != null) { + InputStream stream = loader.openResource(userDictionaryPath); + String encoding = args.get(USER_DICT_ENCODING); + if (encoding == null) { + encoding = IOUtils.UTF_8; + } + CharsetDecoder decoder = Charset.forName(encoding).newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + Reader reader = new InputStreamReader(stream, decoder); + userDictionary = new UserDictionary(reader); + } else { + userDictionary = null; + } + } catch (Exception e) { + throw new InitializationException("Exception thrown while loading dictionary", e); + } + discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true); + } + + @Override + public Tokenizer create(Reader input) { + return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode); + } + + private Mode getMode(Map args) { + String mode = args.get(MODE); + if (mode != null) { + return Mode.valueOf(mode.toUpperCase(Locale.ROOT)); + } else { + return JapaneseTokenizer.DEFAULT_MODE; + } + } +} Index: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java =================================================================== --- lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java (revision 1365496) +++ 
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory =================================================================== --- lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory (revision 0) +++ lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.CharFilterFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilterFactory Index: lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.ja.JapaneseBaseFormFilterFactory +org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilterFactory +org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilterFactory +org.apache.lucene.analysis.ja.JapaneseReadingFormFilterFactory Index: lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory =================================================================== --- lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (revision 0) +++ lucene/analysis/kuromoji/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.ja.JapaneseTokenizerFactory Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java (working copy) @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.analysis.util.ResourceLoader; + +/** Fake resource loader for tests: works if you want to fake reading a single file */ +class StringMockResourceLoader implements ResourceLoader { + String text; + + public StringMockResourceLoader(String text) { + this.text = text; + } + + public List getLines(String resource) throws IOException { + return Arrays.asList(text.split("\n")); + } + + // TODO: do this subpackages thing... wtf is that? + public T newInstance(String cname, Class expectedType, String... 
subpackages) { + try { + Class clazz = Class.forName(cname).asSubclass(expectedType); + return clazz.newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public InputStream openResource(String resource) throws IOException { + return new ByteArrayInputStream(text.getBytes("UTF-8")); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/StringMockResourceLoader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java (working copy) @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests for {@link JapaneseBaseFormFilterFactory} + */ +public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase { + public void testBasics() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + tokenizerFactory.init(args); + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream ts = tokenizerFactory.create(new StringReader("それはまだ実験段階にあります")); + JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(); + ts = factory.create(ts); + assertTokenStreamContents(ts, + new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" } + ); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseBaseFormFilterFactory.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java (working copy) @@ -0,0 +1,98 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.CharFilter; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +/** + * Simple tests for {@link JapaneseIterationMarkCharFilterFactory} + */ +public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamTestCase { + + public void testIterationMarksWithKeywordTokenizer() throws IOException { + final String text = "時々馬鹿々々しいところゞゝゝミスヾ"; + JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); + CharFilter filter = filterFactory.create(new StringReader(text)); + TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false); + assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"}); + } + + public void testIterationMarksWithJapaneseTokenizer() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + Map tokenizerArgs = Collections.emptyMap(); + tokenizerFactory.init(tokenizerArgs); + tokenizerFactory.inform(new StringMockResourceLoader("")); + + JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); + Map filterArgs = Collections.emptyMap(); + filterFactory.init(filterArgs); + + CharFilter filter = filterFactory.create( + new StringReader("時々馬鹿々々しいところゞゝゝミスヾ") + ); + TokenStream tokenStream = tokenizerFactory.create(filter); + assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"}); + } + + public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + Map tokenizerArgs = Collections.emptyMap(); + tokenizerFactory.init(tokenizerArgs); + tokenizerFactory.inform(new 
StringMockResourceLoader("")); + + JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); + Map filterArgs = new HashMap(); + filterArgs.put("normalizeKanji", "true"); + filterArgs.put("normalizeKana", "false"); + filterFactory.init(filterArgs); + + CharFilter filter = filterFactory.create( + new StringReader("時々馬鹿々々しいところゞゝゝミスヾ") + ); + TokenStream tokenStream = tokenizerFactory.create(filter); + assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"}); + } + + public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + Map tokenizerArgs = Collections.emptyMap(); + tokenizerFactory.init(tokenizerArgs); + tokenizerFactory.inform(new StringMockResourceLoader("")); + + JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); + Map filterArgs = new HashMap(); + filterArgs.put("normalizeKanji", "false"); + filterArgs.put("normalizeKana", "true"); + filterFactory.init(filterArgs); + + CharFilter filter = filterFactory.create( + new StringReader("時々馬鹿々々しいところゞゝゝミスヾ") + ); + TokenStream tokenStream = tokenizerFactory.create(filter); + assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"}); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseIterationMarkCharFilterFactory.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java (working copy) @@ -0,0 +1,48 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +/** + * Simple tests for {@link JapaneseKatakanaStemFilterFactory} + */ +public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCase { + public void testKatakanaStemming() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + Map tokenizerArgs = Collections.emptyMap(); + tokenizerFactory.init(tokenizerArgs); + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream tokenStream = tokenizerFactory.create( + new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。") + ); + JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(); + Map filterArgs = Collections.emptyMap(); + filterFactory.init(filterArgs); + assertTokenStreamContents(filterFactory.create(tokenStream), + new String[]{ "明後日", "パーティ", "に", "行く", "予定", "が", "ある", // パーティー should be stemmed + "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"} // コピー should not be stemmed + ); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseKatakanaStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: 
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java (working copy) @@ -0,0 +1,55 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests for {@link JapanesePartOfSpeechStopFilterFactory} + */ +public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTestCase { + public void testBasics() throws IOException { + String tags = + "# verb-main:\n" + + "動詞-自立\n"; + + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map tokenizerArgs = Collections.emptyMap(); + tokenizerFactory.init(tokenizerArgs); + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream ts = tokenizerFactory.create(new StringReader("私は制限スピードを超える。")); + JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(); + Map args = new HashMap(); + args.put("tags", "stoptags.txt"); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(args); + factory.inform(new StringMockResourceLoader(tags)); + ts = factory.create(ts); + assertTokenStreamContents(ts, + new String[] { "私", "は", "制限", "スピード", "を" } + ); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapanesePartOfSpeechStopFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No 
newline at end of property Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java (revision 0) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java (working copy) @@ -0,0 +1,43 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.Map; + +/** + * Simple tests for {@link JapaneseReadingFormFilterFactory} + */ +public class TestJapaneseReadingFormFilterFactory extends BaseTokenStreamTestCase { + public void testReadings() throws IOException { + JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); + Map args = Collections.emptyMap(); + tokenizerFactory.init(args); + tokenizerFactory.inform(new StringMockResourceLoader("")); + TokenStream tokenStream = tokenizerFactory.create(new StringReader("先ほどベルリンから来ました。")); + JapaneseReadingFormFilterFactory filterFactory = new JapaneseReadingFormFilterFactory(); + assertTokenStreamContents(filterFactory.create(tokenStream), + new String[] { "サキ", "ホド", "ベルリン", "カラ", "キ", "マシ", "タ" } + ); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseReadingFormFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java (revision 0) +++ 
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java (working copy) @@ -0,0 +1,120 @@ +package org.apache.lucene.analysis.ja; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; + +/** + * Simple tests for {@link JapaneseTokenizerFactory} + */ +public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase { + /** + * Test basic tokenization with default arguments, checking token text and offsets + */ + public void testSimple() throws IOException { + JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + factory.inform(new StringMockResourceLoader("")); + TokenStream ts = factory.create(new StringReader("これは本ではない")); + assertTokenStreamContents(ts, + new String[] { "これ", "は", "本", "で", "は", "ない" }, + new int[] { 0, 2, 3, 4, 5, 6 }, + new int[] { 2, 3, 4, 5, 6, 8 } + ); + } + + /** + * Test that search mode is enabled and working by default + */ + public void testDefaults() throws 
IOException { + JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + factory.inform(new StringMockResourceLoader("")); + TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア")); + assertTokenStreamContents(ts, + new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" } + ); + } + + /** + * Test mode parameter: specifying normal mode + */ + public void testMode() throws IOException { + JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); + Map args = new HashMap(); + args.put("mode", "normal"); + factory.init(args); + factory.inform(new StringMockResourceLoader("")); + TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア")); + assertTokenStreamContents(ts, + new String[] { "シニアソフトウェアエンジニア" } + ); + } + + /** + * Test user dictionary + */ + public void testUserDict() throws IOException { + String userDict = + "# Custom segmentation for long entries\n" + + "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n" + + "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" + + "# Custom reading for sumo wrestler\n" + + "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n"; + JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); + Map args = new HashMap(); + args.put("userDictionary", "userdict.txt"); + factory.init(args); + factory.inform(new StringMockResourceLoader(userDict)); + TokenStream ts = factory.create(new StringReader("関西国際空港に行った")); + assertTokenStreamContents(ts, + new String[] { "関西", "国際", "空港", "に", "行っ", "た" } + ); + } + + /** + * Test preserving punctuation + */ + public void testPreservePunctuation() throws IOException { + JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); + Map args = new HashMap(); + args.put("discardPunctuation", "false"); + factory.init(args); + factory.inform(new StringMockResourceLoader("")); + TokenStream ts = factory.create( + new 
StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。") + ); + assertTokenStreamContents(ts, + new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、", + "来週", "の", "頭", "日本", "に", "戻り", "ます", "。", + "楽しみ", "に", "し", "て", "い", "ます", "!", + "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"} + ); + } +} Index: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java =================================================================== --- lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java (revision 1365496) +++ lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java (working copy) Property changes on: lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java =================================================================== --- lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (revision 0) +++ lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (working copy) @@ -0,0 +1,81 @@ +package org.apache.lucene.analysis.morfologik; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; +import java.util.Locale; +import java.util.Map; + +import morfologik.stemming.PolishStemmer.DICTIONARY; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.morfologik.MorfologikFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Filter factory for {@link MorfologikFilter}. + *
+ * <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
+ *   </analyzer>
+ * </fieldType>
+ * + *

Any of the Morfologik dictionaries can be used; at the moment these are: + * MORFOLOGIK (Morfologik's original dictionary), + * MORFEUSZ (Morfeusz-SIAT), + * COMBINED (both of the dictionaries above, combined). + * + * @see Morfologik web site + */ +public class MorfologikFilterFactory extends TokenFilterFactory { + /** Dictionary handed to every {@link MorfologikFilter} this factory creates; MORFOLOGIK unless {@link #init(Map)} selects another. */ + private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK; + + /** Name of the init argument that selects the dictionary. */ + public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary"; + + /** + * {@inheritDoc} + */ + @Override + public TokenStream create(TokenStream ts) { + return new MorfologikFilter(ts, dictionary, luceneMatchVersion); + } + + /** + * {@inheritDoc} Reads the optional {@link #DICTIONARY_SCHEMA_ATTRIBUTE} argument, upper-cases it and resolves it to a {@link DICTIONARY} constant; an absent or empty value keeps the current dictionary, and an unknown name is rejected with an {@link IllegalArgumentException} that carries the original failure as its cause. + */ + @Override + public void init(Map args) { + super.init(args); + String dictionaryName = args.get(DICTIONARY_SCHEMA_ATTRIBUTE); + if (dictionaryName != null && !dictionaryName.isEmpty()) { + try { + DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT)); + assert dictionary != null; + this.dictionary = dictionary; + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the " + + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: " + + dictionaryName, e); + } + } + } +} Index: lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java =================================================================== --- lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (revision 1365496) +++ lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java (working copy) Property changes on: lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property 
Index: lucene/analysis/morfologik/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/morfologik/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/morfologik/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.morfologik.MorfologikFilterFactory Index: lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java =================================================================== --- lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (revision 0) +++ lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (working copy) @@ -0,0 +1,44 @@ +package org.apache.lucene.analysis.morfologik; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Test for {@link MorfologikFilterFactory}. 
+ */ +public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase { + public void testCreateDictionary() throws Exception { + StringReader reader = new StringReader("rowery bilety"); + Map initParams = new HashMap(); + initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE, + "morfologik"); + MorfologikFilterFactory factory = new MorfologikFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + factory.init(initParams); + TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, + reader)); + assertTokenStreamContents(ts, new String[] {"rower", "bilet"}); + } +} Index: lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java =================================================================== --- lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (revision 1365496) +++ lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java (working copy) Property changes on: lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java (revision 0) +++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java (working copy) @@ -0,0 +1,77 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Map; + +import org.apache.commons.codec.language.bm.Languages.LanguageSet; +import org.apache.commons.codec.language.bm.NameType; +import org.apache.commons.codec.language.bm.PhoneticEngine; +import org.apache.commons.codec.language.bm.RuleType; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.BeiderMorseFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link BeiderMorseFilter}. + *

+ * <fieldType name="text_bm" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.StandardTokenizerFactory"/>
+ *     <filter class="solr.BeiderMorseFilterFactory"
+ *        nameType="GENERIC" ruleType="APPROX" 
+ *        concat="true" languageSet="auto">
+ *     </filter>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class BeiderMorseFilterFactory extends TokenFilterFactory { + private PhoneticEngine engine; + private LanguageSet languageSet; + + public void init(Map args) { + super.init(args); + + // PhoneticEngine = NameType + RuleType + concat + // we use common-codec's defaults: GENERIC + APPROX + true + String nameTypeArg = args.get("nameType"); + NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg); + + String ruleTypeArg = args.get("ruleType"); + RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg); + + boolean concat = getBoolean("concat", true); + engine = new PhoneticEngine(nameType, ruleType, concat); + + // LanguageSet: defaults to automagic, otherwise a comma-separated list. + String languageSetArg = args.get("languageSet"); + if (languageSetArg == null || languageSetArg.equals("auto")) { + languageSet = null; + } else { + languageSet = LanguageSet.from(new HashSet(Arrays.asList(languageSetArg.split(",")))); + } + } + + @Override + public TokenStream create(TokenStream input) { + return new BeiderMorseFilter(input, engine, languageSet); + } +} Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java (revision 1365496) +++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java (working copy) Property changes on: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/BeiderMorseFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: 
lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java (revision 0) +++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java (working copy) @@ -0,0 +1,61 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link DoubleMetaphoneFilter}. + *
+ * <fieldType name="text_dblmtphn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.DoubleMetaphoneFilterFactory" inject="true" maxCodeLength="4"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class DoubleMetaphoneFilterFactory extends TokenFilterFactory +{ + public static final String INJECT = "inject"; + public static final String MAX_CODE_LENGTH = "maxCodeLength"; + + public static final int DEFAULT_MAX_CODE_LENGTH = 4; + + private boolean inject = true; + private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH; + + @Override + public void init(Map args) { + super.init(args); + + inject = getBoolean(INJECT, true); + + if (args.get(MAX_CODE_LENGTH) != null) { + maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH)); + } + } + + public DoubleMetaphoneFilter create(TokenStream input) { + return new DoubleMetaphoneFilter(input, maxCodeLength, inject); + } +} Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java (revision 1365496) +++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java (working copy) Property changes on: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/DoubleMetaphoneFilterFactory.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (revision 0) +++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (working copy) @@ -0,0 +1,149 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under 
one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.lang.reflect.Method; +import java.lang.reflect.InvocationTargetException; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +import org.apache.commons.codec.Encoder; +import org.apache.commons.codec.language.*; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.PhoneticFilter; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PhoneticFilter}. + * + * Create tokens based on phonetic encoders from Apache Commons Codec. + *

+ * This takes one required argument, "encoder", and the rest are optional: + *

+ *
encoder
required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0), + * or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name: as given + * if it already contains a '.', or otherwise as a class in the same package as these others. + *
inject
(default=true) add the phonetic tokens to the stream alongside the original tokens (with the offset=0); if false, only the phonetic tokens are emitted + *
maxCodeLength
The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't + * support this then specifying this is an error. + *
+ * + *
+ * <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.PhoneticFilterFactory" encoder="DoubleMetaphone" inject="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @see PhoneticFilter + */ +public class PhoneticFilterFactory extends TokenFilterFactory +{ + public static final String ENCODER = "encoder"; + public static final String INJECT = "inject"; // boolean + public static final String MAX_CODE_LENGTH = "maxCodeLength"; + private static final String PACKAGE_CONTAINING_ENCODERS = "org.apache.commons.codec.language."; + + //Effectively constants; uppercase keys + private static final Map> registry = new HashMap>(6); + + static { + registry.put("DoubleMetaphone".toUpperCase(Locale.ROOT), DoubleMetaphone.class); + registry.put("Metaphone".toUpperCase(Locale.ROOT), Metaphone.class); + registry.put("Soundex".toUpperCase(Locale.ROOT), Soundex.class); + registry.put("RefinedSoundex".toUpperCase(Locale.ROOT), RefinedSoundex.class); + registry.put("Caverphone".toUpperCase(Locale.ROOT), Caverphone2.class); + registry.put("ColognePhonetic".toUpperCase(Locale.ROOT), ColognePhonetic.class); + } + + protected boolean inject = true; + protected String name = null; + protected Class clazz = null; + protected Method setMaxCodeLenMethod = null; + protected Integer maxCodeLength = null; + + @Override + public void init(Map args) { + super.init( args ); + + inject = getBoolean(INJECT, true); + + String name = args.get( ENCODER ); + if( name == null ) { + throw new InitializationException("Missing required parameter: " + ENCODER + + " [" + registry.keySet() + "]"); + } + clazz = registry.get(name.toUpperCase(Locale.ROOT)); + if( clazz == null ) { + clazz = resolveEncoder(name); + } + + String v = args.get(MAX_CODE_LENGTH); + if (v != null) { + maxCodeLength = Integer.valueOf(v); + try { + setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class); + } catch (Exception e) { + throw new InitializationException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e); + } + } + + getEncoder();//trigger initialization for potential problems to be thrown now + } + + private Class resolveEncoder(String name) { + 
String lookupName = name; + if (name.indexOf('.') == -1) { + lookupName = PACKAGE_CONTAINING_ENCODERS + name; + } + try { + return Class.forName(lookupName).asSubclass(Encoder.class); + } catch (ClassNotFoundException cnfe) { + throw new InitializationException("Unknown encoder: " + name + " must be full class name or one of " + registry.keySet(), cnfe); + } catch (ClassCastException e) { + throw new InitializationException("Not an encoder: " + name + " must be full class name or one of " + registry.keySet(), e); + } + } + + /** Must be thread-safe. */ + protected Encoder getEncoder() { + // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate + // every time. A simple benchmark showed this as negligible. + try { + Encoder encoder = clazz.newInstance(); + // Try to set the maxCodeLength + if(maxCodeLength != null && setMaxCodeLenMethod != null) { + setMaxCodeLenMethod.invoke(encoder, maxCodeLength); + } + return encoder; + } catch (Exception e) { + final Throwable t = (e instanceof InvocationTargetException) ? 
e.getCause() : e; + throw new InitializationException("Error initializing encoder: " + name + " / " + clazz, t); + } + } + + public PhoneticFilter create(TokenStream input) { + return new PhoneticFilter(input, getEncoder(), inject); + } + +} Index: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (revision 1365496) +++ lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java (working copy) Property changes on: lucene/analysis/phonetic/src/java/org/apache/lucene/analysis/phonetic/PhoneticFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +HeadURL \ No newline at end of property Index: lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/phonetic/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.phonetic.BeiderMorseFilterFactory +org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilterFactory +org.apache.lucene.analysis.phonetic.PhoneticFilterFactory Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java (revision 0) +++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java (working copy) @@ -0,0 +1,70 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; + +/** Simple tests for {@link BeiderMorseFilterFactory} */ +public class TestBeiderMorseFilterFactory extends BaseTokenStreamTestCase { + public void testBasics() throws Exception { + BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(); + factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); + Map args = Collections.emptyMap(); + factory.init(args); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(ts, + new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" }, + new int[] { 0, 0, 0, 0, 0, 0 }, + new int[] { 8, 8, 8, 8, 8, 8 }, + new int[] { 1, 0, 0, 0, 0, 0 }); + } + + public void testLanguageSet() throws Exception { + BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(); + Map args = new HashMap(); + args.put("languageSet", "polish"); + factory.init(args); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(ts, + new String[] { "vDmbYrk", "vDmbirk", "vambYrk", "vambirk", "vimbYrk", "vimbirk" }, + new int[] { 0, 0, 0, 0, 0, 0 }, + new int[] { 8, 8, 8, 8, 8, 8 }, + new int[] { 1, 0, 0, 0, 0, 0 }); + } + + public void testOptions() throws Exception { + BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(); + Map args = new HashMap(); + args.put("nameType", "ASHKENAZI"); + args.put("ruleType", "EXACT"); + factory.init(args); + TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); + assertTokenStreamContents(ts, + new String[] { "vajnberk" }, + new int[] { 0 }, + new int[] { 8 }, + new 
int[] { 1 }); + } +} Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java (revision 1365496) +++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java (working copy) Property changes on: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestBeiderMorseFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java (revision 0) +++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java (working copy) @@ -0,0 +1,77 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +public class TestDoubleMetaphoneFilterFactory extends BaseTokenStreamTestCase { + + public void testDefaults() throws Exception { + DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); + factory.init(new HashMap()); + TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false); + + TokenStream filteredStream = factory.create(inputStream); + assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); + assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" }); + } + + public void testSettingSizeAndInject() throws Exception { + DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); + Map parameters = new HashMap(); + parameters.put("inject", "false"); + parameters.put("maxCodeLength", "8"); + factory.init(parameters); + + TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false); + + TokenStream filteredStream = factory.create(inputStream); + assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); + assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" }); + } + + /** + * Ensure that reset() removes any state (buffered tokens) + */ + public void testReset() throws Exception { + DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); + factory.init(new HashMap()); + TokenStream inputStream = new MockTokenizer(new StringReader("international"), 
MockTokenizer.WHITESPACE, false); + + TokenStream filteredStream = factory.create(inputStream); + CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class); + assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); + + filteredStream.reset(); + assertTrue(filteredStream.incrementToken()); + assertEquals(13, termAtt.length()); + assertEquals("international", termAtt.toString()); + filteredStream.reset(); + + // ensure there are no more tokens, such as ANTRNXNL + assertFalse(filteredStream.incrementToken()); + } +} Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java (revision 1365496) +++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java (working copy) Property changes on: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestDoubleMetaphoneFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java =================================================================== --- lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java (revision 0) +++ lucene/analysis/phonetic/src/test/org/apache/lucene/analysis/phonetic/TestPhoneticFilterFactory.java (working copy) @@ -0,0 +1,185 @@ +package org.apache.lucene.analysis.phonetic; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.codec.language.Metaphone; +import org.apache.commons.codec.language.Caverphone2; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.util.LuceneTestCase.Slow; + + +/** + * + */ +@Slow +public class TestPhoneticFilterFactory extends BaseTokenStreamTestCase { + + private static final int REPEATS = 100000; + + /** + * Case: default + */ + public void testFactory() + { + Map args = new HashMap(); + + PhoneticFilterFactory ff = new PhoneticFilterFactory(); + + args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); + ff.init( args ); + assertTrue( ff.getEncoder() instanceof Metaphone ); + assertTrue( ff.inject ); // default + + args.put( PhoneticFilterFactory.INJECT, "false" ); + ff.init( args ); + assertFalse( ff.inject ); + + args.put( PhoneticFilterFactory.MAX_CODE_LENGTH, "2"); + ff.init( args ); + assertEquals(2,((Metaphone) ff.getEncoder()).getMaxCodeLen()); + } + + /** + * Case: Failures and Exceptions + */ + public void testFactoryCaseFailure() + { + Map args = new HashMap(); + + PhoneticFilterFactory ff = new PhoneticFilterFactory(); + try { + ff.init( args ); + fail( "missing encoder parameter" 
); + } + catch( Exception ex ) {} + args.put( PhoneticFilterFactory.ENCODER, "XXX" ); + try { + ff.init( args ); + fail( "unknown encoder parameter" ); + } + catch( Exception ex ) {} + args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.NonExistence" ); + try { + ff.init( args ); + fail( "unknown encoder parameter" ); + } + catch( Exception ex ) {} + } + + /** + * Case: Reflection + */ + public void testFactoryCaseReflection() + { + Map args = new HashMap(); + + PhoneticFilterFactory ff = new PhoneticFilterFactory(); + + args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" ); + ff.init( args ); + assertTrue( ff.getEncoder() instanceof Metaphone ); + assertTrue( ff.inject ); // default + + // we use "Caverphone2" as it is registered in the REGISTRY as Caverphone, + // so this effectively tests reflection without package name + args.put( PhoneticFilterFactory.ENCODER, "Caverphone2" ); + ff.init( args ); + assertTrue( ff.getEncoder() instanceof Caverphone2 ); + assertTrue( ff.inject ); // default + + // cross check with registry + args.put( PhoneticFilterFactory.ENCODER, "Caverphone" ); + ff.init( args ); + assertTrue( ff.getEncoder() instanceof Caverphone2 ); + assertTrue( ff.inject ); // default + } + + public void testAlgorithms() throws Exception { + assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); + assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg", + new String[] { "A", "B", "KKK", "ESKS" }); + + assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); + assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg", + new String[] { "A", "PP", "KK", "ASKS" }); + + assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg", + new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); + 
assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg", + new String[] { "A000", "B000", "C000", "E220" }); + + assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg", + new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); + assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg", + new String[] { "A0", "B1", "C3", "E034034" }); + + assertAlgorithm("Caverphone", "true", "Darda Karleen Datha Carlene", + new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen", + "TTA1111111", "Datha", "KLN1111111", "Carlene" }); + assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene", + new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); + + assertAlgorithm("ColognePhonetic", "true", "Meier Schmitt Meir Schmidt", + new String[] { "67", "Meier", "862", "Schmitt", + "67", "Meir", "862", "Schmidt" }); + assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt", + new String[] { "67", "862", "67", "862" }); + } + + static void assertAlgorithm(String algName, String inject, String input, + String[] expected) throws Exception { + Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); + Map args = new HashMap(); + args.put("encoder", algName); + args.put("inject", inject); + PhoneticFilterFactory factory = new PhoneticFilterFactory(); + factory.init(args); + TokenStream stream = factory.create(tokenizer); + assertTokenStreamContents(stream, expected); + } + + public void testSpeed() throws Exception { + checkSpeedEncoding("Metaphone", "easgasg", "ESKS"); + checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS"); + checkSpeedEncoding("Soundex", "easgasg", "E220"); + checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034"); + checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111"); + checkSpeedEncoding("ColognePhonetic", "Schmitt", "862"); + } + + private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws 
Exception { + long start = System.currentTimeMillis(); + for ( int i=0; i + * Note: this class will currently emit tokens for punctuation. So you should either add + * a WordDelimiterFilter after to remove these (with concatenate off), or use the + * SmartChinese stoplist with a StopFilterFactory via: + * words="org/apache/lucene/analysis/cn/smart/stopwords.txt" + * @lucene.experimental + */ +public class SmartChineseWordTokenFilterFactory extends TokenFilterFactory { + public TokenFilter create(TokenStream input) { + return new WordTokenFilter(input); + } +} Index: lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java =================================================================== --- lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java (revision 1365496) +++ lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java (working copy) Property changes on: lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseWordTokenFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory =================================================================== --- lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.cn.smart.SmartChineseWordTokenFilterFactory Index: lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory =================================================================== --- lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (revision 0) +++ lucene/analysis/smartcn/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.lucene.analysis.cn.smart.SmartChineseSentenceTokenizerFactory Index: lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java =================================================================== --- lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java (revision 0) +++ lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java (working copy) @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.cn.smart; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Tests for {@link SmartChineseSentenceTokenizerFactory} and + * {@link SmartChineseWordTokenFilterFactory} + */ +public class TestSmartChineseFactories extends BaseTokenStreamTestCase { + /** Test showing the behavior with whitespace */ + public void testSimple() throws Exception { + String sentence = "我购买了道具和服装。"; + WhitespaceTokenizer ws = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(sentence)); + SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(); + TokenStream ts = factory.create(ws); + // TODO: fix smart chinese to not emit punctuation tokens + // at the moment: you have to clean up with WDF, or use the stoplist, etc + assertTokenStreamContents(ts, + new String[] { "我", "购买", "了", "道具", "和", "服装", "," }); + } + + /** Test showing the behavior with the sentence tokenizer */ + public void testTokenizer() throws Exception { + String sentence = "我购买了道具和服装。我购买了道具和服装。"; + SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory(); + Tokenizer tokenizer = tokenizerFactory.create(new StringReader(sentence)); + SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(); + TokenStream ts = factory.create(tokenizer); + // TODO: fix smart chinese to not emit punctuation tokens + // at the moment: you have to clean up with WDF, or use the stoplist, etc + assertTokenStreamContents(ts, + new String[] { "我", "购买", "了", "道具", "和", "服装", ",", + "我", "购买", "了", "道具", "和", "服装", "," + }); + } +} Index: lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java =================================================================== --- 
lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java (revision 1365496) +++ lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java (working copy) Property changes on: lucene/analysis/smartcn/src/test/org/apache/lucene/analysis/cn/smart/TestSmartChineseFactories.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java =================================================================== --- lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java (revision 0) +++ lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java (working copy) @@ -0,0 +1,49 @@ +package org.apache.lucene.analysis.stempel; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.stempel.StempelFilter; +import org.apache.lucene.analysis.stempel.StempelStemmer; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.egothor.stemmer.Trie; + +/** + * Factory for {@link StempelFilter} using a Polish stemming table. + */ +public class StempelPolishStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private Trie stemmer = null; + private static final String STEMTABLE = "/org/apache/lucene/analysis/pl/stemmer_20000.tbl"; + + public TokenStream create(TokenStream input) { + return new StempelFilter(input, new StempelStemmer(stemmer)); + } + + public void inform(ResourceLoader loader) { + try { + stemmer = StempelStemmer.load(loader.openResource(STEMTABLE)); + } catch (IOException e) { + throw new InitializationException("Could not load stem table: " + STEMTABLE, e); + } + } +} Index: lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java =================================================================== --- lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java (revision 1365496) +++ lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java (working copy) Property changes on: lucene/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelPolishStemFilterFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/stempel/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory 
=================================================================== --- lucene/analysis/stempel/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (revision 0) +++ lucene/analysis/stempel/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (working copy) @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.stempel.StempelPolishStemFilterFactory Index: lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java =================================================================== --- lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java (revision 0) +++ lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java (working copy) @@ -0,0 +1,85 @@ +package org.apache.lucene.analysis.stempel; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CodingErrorAction; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.util.IOUtils; + +public class ResourceAsStreamResourceLoader implements ResourceLoader { + Class clazz; + + public ResourceAsStreamResourceLoader(Class clazz) { + this.clazz = clazz; + } + + @Override + public InputStream openResource(String resource) throws IOException { + return clazz.getResourceAsStream(resource); + } + + @Override + public List getLines(String resource) throws IOException { + BufferedReader input = null; + ArrayList lines; + try { + input = new BufferedReader(new InputStreamReader(openResource(resource), + IOUtils.CHARSET_UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT))); + + lines = new ArrayList(); + for (String word=null; (word=input.readLine())!=null;) { + // skip initial bom marker + if (lines.isEmpty() && word.length() > 0 && word.charAt(0) == '\uFEFF') + word = word.substring(1); + // skip comments + if (word.startsWith("#")) continue; + word=word.trim(); + // skip blank lines + if (word.length()==0) continue; + lines.add(word); + } + } 
catch (CharacterCodingException ex) { + throw new RuntimeException("Error loading resource (wrong encoding?): " + resource, ex); + } finally { + if (input != null) + input.close(); + } + return lines; + } + + // TODO: honor the subpackages argument; it is currently ignored + @Override + public T newInstance(String cname, Class expectedType, String... subpackages) { + try { + Class clazz = Class.forName(cname).asSubclass(expectedType); + return clazz.newInstance(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} Index: lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java =================================================================== --- lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java (revision 1365496) +++ lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java (working copy) Property changes on: lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/ResourceAsStreamResourceLoader.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java =================================================================== --- lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java (revision 0) +++ lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java (working copy) @@ -0,0 +1,38 @@ +package org.apache.lucene.analysis.stempel; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; + +/** + * Tests for {@link StempelPolishStemFilterFactory} + */ +public class TestStempelPolishStemFilterFactory extends BaseTokenStreamTestCase { + public void testBasics() throws Exception { + StringReader document = new StringReader("studenta studenci"); + StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory(); + factory.inform(new ResourceAsStreamResourceLoader(getClass())); + TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, document)); + assertTokenStreamContents(ts, + new String[] { "student", "student" }); + } +} Index: lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java =================================================================== --- lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java (revision 1365496) +++ lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java (working copy) Property changes on: lucene/analysis/stempel/src/test/org/apache/lucene/analysis/stempel/TestStempelPolishStemFilterFactory.java 
___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java =================================================================== --- lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java (revision 0) +++ lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java (working copy) @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer; + +import java.io.Reader; +import java.util.Map; + +/** + * {@link org.apache.lucene.analysis.util.TokenizerFactory} for {@link UIMAAnnotationsTokenizer} + */ +public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory { + + private String descriptorPath; + private String tokenType; + + @Override + public void init(Map args) { + super.init(args); + descriptorPath = args.get("descriptorPath"); + tokenType = args.get("tokenType"); + if (descriptorPath == null || tokenType == null) { + throw new InitializationException("Both descriptorPath and tokenType are mandatory"); + } + } + + @Override + public Tokenizer create(Reader input) { + return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input); + } +} Index: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java =================================================================== --- lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java (revision 1365496) +++ lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java (working copy) Property changes on: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java =================================================================== --- lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java (revision 0) +++ 
lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java (working copy) @@ -0,0 +1,52 @@ +package org.apache.lucene.analysis.uima; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer; +import org.apache.lucene.analysis.util.InitializationException; +import org.apache.lucene.analysis.util.TokenizerFactory; + +import java.io.Reader; +import java.util.Map; + +/** + * {@link org.apache.lucene.analysis.util.TokenizerFactory} for {@link UIMATypeAwareAnnotationsTokenizer} + */ +public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory { + + private String descriptorPath; + private String tokenType; + private String featurePath; + + @Override + public void init(Map args) { + super.init(args); + descriptorPath = args.get("descriptorPath"); + tokenType = args.get("tokenType"); + featurePath = args.get("featurePath"); + if (descriptorPath == null || tokenType == null || featurePath == null) { + throw new InitializationException("descriptorPath, tokenType, and featurePath are mandatory"); + } + } + + @Override + public Tokenizer 
create(Reader input) { + return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input); + } +} Index: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java =================================================================== --- lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java (revision 1365496) +++ lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java (working copy) Property changes on: lucene/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizerFactory.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/analysis/uima/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory =================================================================== --- lucene/analysis/uima/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (revision 0) +++ lucene/analysis/uima/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenizerFactory (working copy) @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizerFactory +org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizerFactory Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1365483) +++ lucene/CHANGES.txt (working copy) Property changes on: lucene/CHANGES.txt ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/CHANGES.txt:r1364862-1365496 Index: lucene/core =================================================================== --- lucene/core (revision 1365483) +++ lucene/core (working copy) Property changes on: lucene/core ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/core:r1364862-1365496 Index: lucene/core/src/java/org/apache/lucene/util/NamedSPILoader.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/NamedSPILoader.java (revision 1365483) +++ lucene/core/src/java/org/apache/lucene/util/NamedSPILoader.java (working copy) @@ -22,7 +22,7 @@ import java.util.Map; import java.util.LinkedHashMap; import java.util.Set; -import java.util.ServiceLoader; +import java.util.ServiceConfigurationError; /** * Helper class for loading named SPIs from classpath (e.g. Codec, PostingsFormat). 
@@ -32,64 +32,53 @@ public final class NamedSPILoader implements Iterable { private final Map services; - - /** This field is a hack for LuceneTestCase to get access - * to the modifiable map (to work around bugs in IBM J9) */ - @SuppressWarnings("unused") - @Deprecated - // Hackidy-Häck-Hack for bugs in IBM J9 ServiceLoader - private final Map modifiableServices; - private final Class clazz; public NamedSPILoader(Class clazz) { this.clazz = clazz; - final ServiceLoader loader = ServiceLoader.load(clazz); + final SPIClassIterator loader = SPIClassIterator.get(clazz); final LinkedHashMap services = new LinkedHashMap(); - for (final S service : loader) { - final String name = service.getName(); - // only add the first one for each name, later services will be ignored - // this allows to place services before others in classpath to make - // them used instead of others - if (!services.containsKey(name)) { - assert checkServiceName(name); - services.put(name, service); + while (loader.hasNext()) { + final Class c = loader.next(); + try { + final S service = c.newInstance(); + final String name = service.getName(); + // only add the first one for each name, later services will be ignored + // this allows to place services before others in classpath to make + // them used instead of others + if (!services.containsKey(name)) { + checkServiceName(name); + services.put(name, service); + } + } catch (Exception e) { + throw new ServiceConfigurationError("Cannot instantiate SPI class: " + c.getName(), e); } } - this.modifiableServices = services; // hack, remove when IBM J9 is fixed! 
this.services = Collections.unmodifiableMap(services); } /** * Validates that a service name meets the requirements of {@link NamedSPI} */ - public static boolean checkServiceName(String name) { + public static void checkServiceName(String name) { // based on harmony charset.java if (name.length() >= 128) { throw new IllegalArgumentException("Illegal service name: '" + name + "' is too long (must be < 128 chars)."); } - for (int i = 0; i < name.length(); i++) { + for (int i = 0, len = name.length(); i < len; i++) { char c = name.charAt(i); - if (!isLetter(c) && !isDigit(c)) { + if (!isLetterOrDigit(c)) { throw new IllegalArgumentException("Illegal service name: '" + name + "' must be simple ascii alphanumeric."); } } - return true; } - /* - * Checks whether a character is a letter (ascii) which are defined in the spec. + /** + * Checks whether a character is a letter or digit (ascii) which are defined in the spec. */ - private static boolean isLetter(char c) { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); + private static boolean isLetterOrDigit(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9'); } - - /* - * Checks whether a character is a digit (ascii) which are defined in the spec. - */ - private static boolean isDigit(char c) { - return ('0' <= c && c <= '9'); - } public S lookup(String name) { final S service = services.get(name); Index: lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java (revision 0) +++ lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java (working copy) @@ -0,0 +1,138 @@ +package org.apache.lucene.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.Iterator; +import java.util.Locale; +import java.util.NoSuchElementException; +import java.util.ServiceConfigurationError; + +/** + * Helper class for loading SPI classes from classpath (META-INF files). + * This is a light impl of {@link java.util.ServiceLoader} but is guaranteed to + * be bug-free regarding classpath order and does not instantiate or initialize + * the classes found. 
+ * + * @lucene.internal + */ +public final class SPIClassIterator implements Iterator> { + private static final String META_INF_SERVICES = "META-INF/services/"; + + private final Class clazz; + private final ClassLoader loader; + private final Enumeration profilesEnum; + private Iterator linesIterator; + + public static SPIClassIterator get(Class clazz) { + return new SPIClassIterator(clazz, Thread.currentThread().getContextClassLoader()); + } + + public static SPIClassIterator get(Class clazz, ClassLoader loader) { + return new SPIClassIterator(clazz, loader); + } + + private SPIClassIterator(Class clazz, ClassLoader loader) { + if (loader == null) + throw new IllegalArgumentException("You must provide a ClassLoader."); + this.clazz = clazz; + this.loader = loader; + try { + this.profilesEnum = loader.getResources(META_INF_SERVICES + clazz.getName()); + } catch (IOException ioe) { + throw new ServiceConfigurationError("Error loading SPI profiles for type " + clazz.getName() + " from classpath", ioe); + } + this.linesIterator = Collections.emptySet().iterator(); + } + + private boolean loadNextProfile() { + ArrayList lines = null; + while (profilesEnum.hasMoreElements()) { + if (lines != null) { + lines.clear(); + } else { + lines = new ArrayList(); + } + final URL url = profilesEnum.nextElement(); + try { + final InputStream in = url.openStream(); + IOException priorE = null; + try { + final BufferedReader reader = new BufferedReader(new InputStreamReader(in, IOUtils.CHARSET_UTF_8)); + String line; + while ((line = reader.readLine()) != null) { + final int pos = line.indexOf('#'); + if (pos >= 0) { + line = line.substring(0, pos); + } + line = line.trim(); + if (line.length() > 0) { + lines.add(line); + } + } + } catch (IOException ioe) { + priorE = ioe; + } finally { + IOUtils.closeWhileHandlingException(priorE, in); + } + } catch (IOException ioe) { + throw new ServiceConfigurationError("Error loading SPI class list from URL: " + url, ioe); + } + if 
(!lines.isEmpty()) { + this.linesIterator = lines.iterator(); + return true; + } + } + return false; + } + + @Override + public boolean hasNext() { + return linesIterator.hasNext() || loadNextProfile(); + } + + @Override + public Class next() { + // hasNext() implicitly loads the next profile, so it is essential to call this here! + if (!hasNext()) { + throw new NoSuchElementException(); + } + assert linesIterator.hasNext(); + final String c = linesIterator.next(); + try { + // don't initialize the class (pass false as 2nd parameter): + return Class.forName(c, false, loader).asSubclass(clazz); + } catch (ClassNotFoundException cnfe) { + throw new ServiceConfigurationError(String.format(Locale.ROOT, "A SPI class of type %s with classname %s does not exist, "+ + "please fix the file '%s%1$s' in your classpath.", clazz.getName(), c, META_INF_SERVICES)); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + +} Index: lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java =================================================================== --- lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java (revision 1365496) +++ lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java (working copy) Property changes on: lucene/core/src/java/org/apache/lucene/util/SPIClassIterator.java ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +Date Author Id Revision HeadURL \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: lucene/misc =================================================================== --- lucene/misc (revision 1365483) +++ lucene/misc (working copy) Property changes on: lucene/misc ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/misc:r1364862-1365496 Index: lucene/spatial 
=================================================================== --- lucene/spatial (revision 1365483) +++ lucene/spatial (working copy) Property changes on: lucene/spatial ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/spatial:r1364862-1365496 Index: lucene/test-framework =================================================================== --- lucene/test-framework (revision 1365483) +++ lucene/test-framework (working copy) Property changes on: lucene/test-framework ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/lucene/test-framework:r1364862-1365496 Index: lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java =================================================================== --- lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java (revision 1365483) +++ lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java (working copy) @@ -87,33 +87,6 @@ restoreProperties.put("solr.solr.home", System.getProperty("solr.solr.home")); restoreProperties.put("solr.data.dir", System.getProperty("solr.data.dir")); - // enable the Lucene 3.x PreflexRW codec explicitly, to work around bugs in IBM J9 / Harmony ServiceLoader: - try { - final java.lang.reflect.Field spiLoaderField = Codec.class.getDeclaredField("loader"); - spiLoaderField.setAccessible(true); - final Object spiLoader = spiLoaderField.get(null); - final java.lang.reflect.Field modifiableServicesField = NamedSPILoader.class.getDeclaredField("modifiableServices"); - modifiableServicesField.setAccessible(true); - /* note: re-enable this if we make a Lucene4x impersonator - @SuppressWarnings({"unchecked","rawtypes"}) final Map serviceMap = - (Map) modifiableServicesField.get(spiLoader); - if (!(Codec.forName("Lucene3x") instanceof 
PreFlexRWCodec)) { - if (Constants.JAVA_VENDOR.startsWith("IBM")) { - // definitely a buggy version - System.err.println("ERROR: Your VM's java.util.ServiceLoader implementation is buggy"+ - " and does not respect classpath order, please report this to the vendor."); - } else { - // could just be a classpath issue - System.err.println("ERROR: fix your classpath to have tests-framework.jar before lucene-core.jar!"+ - " If you have already done this, then your VM's java.util.ServiceLoader implementation is buggy"+ - " and does not respect classpath order, please report this to the vendor."); - } - serviceMap.put("Lucene3x", new PreFlexRWCodec()); - } */ - } catch (Exception e) { - throw new RuntimeException("Cannot access internals of Codec and NamedSPILoader classes", e); - } - // if verbose: print some debugging stuff about which codecs are loaded. if (VERBOSE) { Set codecs = Codec.availableCodecs(); Index: solr =================================================================== --- solr (revision 1365483) +++ solr (working copy) Property changes on: solr ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/solr:r1364862-1365496 Index: solr/build.xml =================================================================== --- solr/build.xml (revision 1365483) +++ solr/build.xml (working copy) Property changes on: solr/build.xml ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/solr/build.xml:r1364862-1365496 Index: solr/CHANGES.txt =================================================================== --- solr/CHANGES.txt (revision 1365483) +++ solr/CHANGES.txt (working copy) Property changes on: solr/CHANGES.txt ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/solr/CHANGES.txt:r1364862-1365496 Index: solr/contrib 
=================================================================== --- solr/contrib (revision 1365483) +++ solr/contrib (working copy) Property changes on: solr/contrib ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/solr/contrib:r1364862-1365496 Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java (working copy) @@ -1,82 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.util.Arrays; -import java.util.Locale; -import java.util.Map; - -import morfologik.stemming.PolishStemmer.DICTIONARY; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.morfologik.MorfologikFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Filter factory for {@link MorfologikFilter}. - *
- * <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
- *   </analyzer>
- * </fieldType>
- * - *

Any of Morfologik dictionaries can be used, these are at the moment: - * MORFOLOGIK (Morfologik's original dictionary), - * MORFEUSZ (Morfeusz-SIAT), - * COMBINED (both of the dictionaries above, combined). - * - * @see Morfologik web site - */ -public class MorfologikFilterFactory extends TokenFilterFactory { - /** Dictionary. */ - private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK; - - /** Schema attribute. */ - public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary"; - - /** - * {@inheritDoc} - */ - @Override - public TokenStream create(TokenStream ts) { - return new MorfologikFilter(ts, dictionary, luceneMatchVersion); - } - - /** - * {@inheritDoc} - */ - @Override - public void init(Map args) { - super.init(args); - String dictionaryName = args.get(DICTIONARY_SCHEMA_ATTRIBUTE); - if (dictionaryName != null && !dictionaryName.isEmpty()) { - try { - DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT)); - assert dictionary != null; - this.dictionary = dictionary; - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the " - + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: " - + dictionaryName); - } - } - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java (working copy) @@ -1,74 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.icu.ICUTransformFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import com.ibm.icu.text.Transliterator; - -/** - * Factory for {@link ICUTransformFilter}. - *

- * Supports the following attributes: - *

    - *
  • id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()} - *
  • direction (optional): Either 'forward' or 'reverse'. Default is forward. - *
- * @see Transliterator - */ -public class ICUTransformFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - private Transliterator transliterator; - - // TODO: add support for custom rules - @Override - public void init(Map args) { - super.init(args); - String id = args.get("id"); - if (id == null) { - throw new SolrException(ErrorCode.SERVER_ERROR, "id is required."); - } - - int dir; - String direction = args.get("direction"); - if (direction == null || direction.equalsIgnoreCase("forward")) - dir = Transliterator.FORWARD; - else if (direction.equalsIgnoreCase("reverse")) - dir = Transliterator.REVERSE; - else - throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction); - - transliterator = Transliterator.getInstance(id, dir); - } - - public TokenStream create(TokenStream input) { - return new ICUTransformFilter(input, transliterator); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (working copy) @@ -1,34 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -/** - * Factory for the SmartChineseAnalyzer {@link SentenceTokenizer} - * @lucene.experimental - */ -public class SmartChineseSentenceTokenizerFactory extends TokenizerFactory { - public Tokenizer create(Reader input) { - return new SentenceTokenizer(input); - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.stempel.StempelFilter; -import org.apache.lucene.analysis.stempel.StempelStemmer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.egothor.stemmer.Trie; - -/** - * Factory for {@link StempelFilter} using a Polish stemming table. 
- */ -public class StempelPolishStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private Trie stemmer = null; - private static final String STEMTABLE = "org/apache/lucene/analysis/pl/stemmer_20000.tbl"; - - public TokenStream create(TokenStream input) { - return new StempelFilter(input, new StempelStemmer(stemmer)); - } - - public void inform(ResourceLoader loader) { - try { - stemmer = StempelStemmer.load(loader.openResource(STEMTABLE)); - } catch (IOException e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Could not load stem table: " + STEMTABLE); - } - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (working copy) @@ -1,38 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.cn.smart.WordTokenFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for the SmartChineseAnalyzer {@link WordTokenFilter} - *

- * Note: this class will currently emit tokens for punctuation. So you should either add - * a WordDelimiterFilter after to remove these (with concatenate off), or use the - * SmartChinese stoplist with a StopFilterFactory via: - * words="org/apache/lucene/analysis/cn/smart/stopwords.txt" - * @lucene.experimental - */ -public class SmartChineseWordTokenFilterFactory extends TokenFilterFactory { - public TokenFilter create(TokenStream input) { - return new WordTokenFilter(input); - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.icu.ICUFoldingFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Factory for {@link ICUFoldingFilter} */ -public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - @Override - public TokenStream create(TokenStream input) { - return new ICUFoldingFilter(input); - } - - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java (working copy) @@ -1,33 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -/** Factory for {@link ICUTokenizer} */ -public class ICUTokenizerFactory extends TokenizerFactory { - // TODO: add support for custom configs - @Override - public Tokenizer create(Reader input) { - return new ICUTokenizer(input); - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java (working copy) @@ -1,88 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import com.ibm.icu.text.FilteredNormalizer2; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.UnicodeSet; - -/** - * Factory for {@link ICUNormalizer2Filter} - *

- * Supports the following attributes: - *

    - *
  • name: A Unicode Normalization Form, - * one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf. - *
  • mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc - * or nfkc, to get nfd or nfkd, respectively. - *
  • filter: A {@link UnicodeSet} pattern. Codepoints outside the set are - * always left unchanged. Default is [] (the null set, no filtering). - *
- * @see ICUNormalizer2Filter - * @see Normalizer2 - * @see FilteredNormalizer2 - */ -public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - private Normalizer2 normalizer; - - // TODO: support custom normalization - @Override - public void init(Map args) { - super.init(args); - String name = args.get("name"); - if (name == null) - name = "nfkc_cf"; - String mode = args.get("mode"); - if (mode == null) - mode = "compose"; - - if (mode.equals("compose")) - normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE); - else if (mode.equals("decompose")) - normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE); - else - throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode); - - String filter = args.get("filter"); - if (filter != null) { - UnicodeSet set = new UnicodeSet(filter); - if (!set.isEmpty()) { - set.freeze(); - normalizer = new FilteredNormalizer2(normalizer, set); - } - } - } - - public TokenStream create(TokenStream input) { - return new ICUNormalizer2Filter(input, normalizer); - } - - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUFoldingFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.icu.ICUFoldingFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/* - * Licensed to the Apache Software Foundation (ASF) 
under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Factory for {@link ICUFoldingFilter} */ -public class ICUFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - @Override - public TokenStream create(TokenStream input) { - return new ICUFoldingFilter(input); - } - - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUNormalizer2FilterFactory.java (working copy) @@ -1,88 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import com.ibm.icu.text.FilteredNormalizer2; -import com.ibm.icu.text.Normalizer2; -import com.ibm.icu.text.UnicodeSet; - -/** - * Factory for {@link ICUNormalizer2Filter} - *

- * Supports the following attributes: - *

    - *
  • name: A Unicode Normalization Form, - * one of 'nfc','nfkc', 'nfkc_cf'. Default is nfkc_cf. - *
  • mode: Either 'compose' or 'decompose'. Default is compose. Use "decompose" with nfc - * or nfkc, to get nfd or nfkd, respectively. - *
  • filter: A {@link UnicodeSet} pattern. Codepoints outside the set are - * always left unchanged. Default is [] (the null set, no filtering). - *
- * @see ICUNormalizer2Filter - * @see Normalizer2 - * @see FilteredNormalizer2 - */ -public class ICUNormalizer2FilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - private Normalizer2 normalizer; - - // TODO: support custom normalization - @Override - public void init(Map args) { - super.init(args); - String name = args.get("name"); - if (name == null) - name = "nfkc_cf"; - String mode = args.get("mode"); - if (mode == null) - mode = "compose"; - - if (mode.equals("compose")) - normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE); - else if (mode.equals("decompose")) - normalizer = Normalizer2.getInstance(null, name, Normalizer2.Mode.DECOMPOSE); - else - throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid mode: " + mode); - - String filter = args.get("filter"); - if (filter != null) { - UnicodeSet set = new UnicodeSet(filter); - if (!set.isEmpty()) { - set.freeze(); - normalizer = new FilteredNormalizer2(normalizer, set); - } - } - } - - public TokenStream create(TokenStream input) { - return new ICUNormalizer2Filter(input, normalizer); - } - - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTokenizerFactory.java (working copy) @@ -1,33 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -/** Factory for {@link ICUTokenizer} */ -public class ICUTokenizerFactory extends TokenizerFactory { - // TODO: add support for custom configs - @Override - public Tokenizer create(Reader input) { - return new ICUTokenizer(input); - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/ICUTransformFilterFactory.java (working copy) @@ -1,74 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.icu.ICUTransformFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import com.ibm.icu.text.Transliterator; - -/** - * Factory for {@link ICUTransformFilter}. - *

- * Supports the following attributes: - *

    - *
  • id (mandatory): A Transliterator ID, one from {@link Transliterator#getAvailableIDs()} - *
  • direction (optional): Either 'forward' or 'reverse'. Default is forward. - *
- * @see Transliterator - */ -public class ICUTransformFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - private Transliterator transliterator; - - // TODO: add support for custom rules - @Override - public void init(Map args) { - super.init(args); - String id = args.get("id"); - if (id == null) { - throw new SolrException(ErrorCode.SERVER_ERROR, "id is required."); - } - - int dir; - String direction = args.get("direction"); - if (direction == null || direction.equalsIgnoreCase("forward")) - dir = Transliterator.FORWARD; - else if (direction.equalsIgnoreCase("reverse")) - dir = Transliterator.REVERSE; - else - throw new SolrException(ErrorCode.SERVER_ERROR, "invalid direction: " + direction); - - transliterator = Transliterator.getInstance(id, dir); - } - - public TokenStream create(TokenStream input) { - return new ICUTransformFilter(input, transliterator); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/MorfologikFilterFactory.java (working copy) @@ -1,82 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import java.util.Arrays; -import java.util.Locale; -import java.util.Map; - -import morfologik.stemming.PolishStemmer.DICTIONARY; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.morfologik.MorfologikFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Filter factory for {@link MorfologikFilter}. - *
- * <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.MorfologikFilterFactory" dictionary="MORFOLOGIK" />
- *   </analyzer>
- * </fieldType>
- * - *

Any of Morfologik dictionaries can be used, these are at the moment: - * MORFOLOGIK (Morfologik's original dictionary), - * MORFEUSZ (Morfeusz-SIAT), - * COMBINED (both of the dictionaries above, combined). - * - * @see Morfologik web site - */ -public class MorfologikFilterFactory extends TokenFilterFactory { - /** Dictionary. */ - private DICTIONARY dictionary = DICTIONARY.MORFOLOGIK; - - /** Schema attribute. */ - public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary"; - - /** - * {@inheritDoc} - */ - @Override - public TokenStream create(TokenStream ts) { - return new MorfologikFilter(ts, dictionary, luceneMatchVersion); - } - - /** - * {@inheritDoc} - */ - @Override - public void init(Map args) { - super.init(args); - String dictionaryName = args.get(DICTIONARY_SCHEMA_ATTRIBUTE); - if (dictionaryName != null && !dictionaryName.isEmpty()) { - try { - DICTIONARY dictionary = DICTIONARY.valueOf(dictionaryName.toUpperCase(Locale.ROOT)); - assert dictionary != null; - this.dictionary = dictionary; - } catch (IllegalArgumentException e) { - throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute accepts the " - + "following constants: " + Arrays.toString(DICTIONARY.values()) + ", this value is invalid: " - + dictionaryName); - } - } - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseSentenceTokenizerFactory.java (working copy) @@ -1,34 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.cn.smart.SentenceTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -/** - * Factory for the SmartChineseAnalyzer {@link SentenceTokenizer} - * @lucene.experimental - */ -public class SmartChineseSentenceTokenizerFactory extends TokenizerFactory { - public Tokenizer create(Reader input) { - return new SentenceTokenizer(input); - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/SmartChineseWordTokenFilterFactory.java (working copy) @@ -1,38 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.cn.smart.WordTokenFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for the SmartChineseAnalyzer {@link WordTokenFilter} - *

- * Note: this class will currently emit tokens for punctuation. So you should either add - * a WordDelimiterFilter after to remove these (with concatenate off), or use the - * SmartChinese stoplist with a StopFilterFactory via: - * words="org/apache/lucene/analysis/cn/smart/stopwords.txt" - * @lucene.experimental - */ -public class SmartChineseWordTokenFilterFactory extends TokenFilterFactory { - public TokenFilter create(TokenStream input) { - return new WordTokenFilter(input); - } -} Index: solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.stempel.StempelFilter; -import org.apache.lucene.analysis.stempel.StempelStemmer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.common.SolrException; -import org.apache.solr.common.SolrException.ErrorCode; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.egothor.stemmer.Trie; - -/** - * Factory for {@link StempelFilter} using a Polish stemming table. - */ -public class StempelPolishStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private Trie stemmer = null; - private static final String STEMTABLE = "org/apache/lucene/analysis/pl/stemmer_20000.tbl"; - - public TokenStream create(TokenStream input) { - return new StempelFilter(input, new StempelStemmer(stemmer)); - } - - public void inform(ResourceLoader loader) { - try { - stemmer = StempelStemmer.load(loader.openResource(STEMTABLE)); - } catch (IOException e) { - throw new SolrException(ErrorCode.SERVER_ERROR, "Could not load stem table: " + STEMTABLE); - } - } -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUFoldingFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/** basic tests for {@link ICUFoldingFilterFactory} */ -public class TestICUFoldingFilterFactory extends BaseTokenStreamTestCase { - - /** basic tests to ensure the folding is working */ - public void test() throws Exception { - Reader reader = new StringReader("Résumé"); - ICUFoldingFilterFactory factory = new ICUFoldingFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "resume" }); - } -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUNormalizer2FilterFactory.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under 
one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/** basic tests for {@link ICUNormalizer2FilterFactory} */ -public class TestICUNormalizer2FilterFactory extends BaseTokenStreamTestCase { - - /** Test nfkc_cf defaults */ - public void testDefaults() throws Exception { - Reader reader = new StringReader("This is a Test"); - ICUNormalizer2FilterFactory factory = new ICUNormalizer2FilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "this", "is", "a", "test" }); - } - - // TODO: add tests for different forms -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java =================================================================== --- 
solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTokenizerFactory.java (working copy) @@ -1,36 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; - -/** basic tests for {@link ICUTokenizerFactory} **/ -public class TestICUTokenizerFactory extends BaseTokenStreamTestCase { - public void testMixedText() throws Exception { - Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี This is a test ກວ່າດອກ"); - ICUTokenizerFactory factory = new ICUTokenizerFactory(); - TokenStream stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", - "This", "is", "a", "test", "ກວ່າ", "ດອກ"}); - } -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java =================================================================== --- solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestICUTransformFilterFactory.java (working copy) @@ -1,65 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/** basic tests for {@link ICUTransformFilterFactory} */ -public class TestICUTransformFilterFactory extends BaseTokenStreamTestCase { - - /** ensure the transform is working */ - public void test() throws Exception { - Reader reader = new StringReader("簡化字"); - ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); - Map args = new HashMap(); - args.put("id", "Traditional-Simplified"); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "简化字" }); - } - - /** test forward and reverse direction */ - public void testDirection() throws Exception { - // forward - Reader reader = new StringReader("Российская Федерация"); - ICUTransformFilterFactory factory = new ICUTransformFilterFactory(); - Map args = new HashMap(); - args.put("id", "Cyrillic-Latin"); - factory.init(args); - Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Rossijskaâ", "Federaciâ" }); - - // backward (invokes Latin-Cyrillic) - reader = new StringReader("Rossijskaâ Federaciâ"); - args.put("direction", "reverse"); - factory.init(args); - tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader); - stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Российская", "Федерация" }); - } -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java 
=================================================================== --- solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestMorfologikFilterFactory.java (working copy) @@ -1,44 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Test for {@link MorfologikFilterFactory}. 
- */ -public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase { - public void testCreateDictionary() throws Exception { - StringReader reader = new StringReader("rowery bilety"); - Map initParams = new HashMap(); - initParams.put(MorfologikFilterFactory.DICTIONARY_SCHEMA_ATTRIBUTE, - "morfologik"); - MorfologikFilterFactory factory = new MorfologikFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(initParams); - TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, - reader)); - assertTokenStreamContents(ts, new String[] {"rower", "bilet"}); - } -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java =================================================================== --- solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestSmartChineseFactories.java (working copy) @@ -1,58 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/** - * Tests for {@link SmartChineseSentenceTokenizerFactory} and - * {@link SmartChineseWordTokenFilterFactory} - */ -public class TestSmartChineseFactories extends BaseTokenStreamTestCase { - /** Test showing the behavior with whitespace */ - public void testSimple() throws Exception { - String sentence = "我购买了道具和服装。"; - WhitespaceTokenizer ws = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(sentence)); - SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(); - TokenStream ts = factory.create(ws); - // TODO: fix smart chinese to not emit punctuation tokens - // at the moment: you have to clean up with WDF, or use the stoplist, etc - assertTokenStreamContents(ts, - new String[] { "我", "购买", "了", "道具", "和", "服装", "," }); - } - - /** Test showing the behavior with whitespace */ - public void testTokenizer() throws Exception { - String sentence = "我购买了道具和服装。我购买了道具和服装。"; - SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory(); - Tokenizer tokenizer = tokenizerFactory.create(new StringReader(sentence)); - SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(); - TokenStream ts = factory.create(tokenizer); - // TODO: fix smart chinese to not emit punctuation tokens - // at the moment: you have to clean up with WDF, or use the stoplist, etc - assertTokenStreamContents(ts, - new String[] { "我", "购买", "了", "道具", "和", "服装", ",", - "我", "购买", "了", "道具", "和", "服装", "," - }); - } -} Index: solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java =================================================================== --- 
solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java (revision 1365483) +++ solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.solr.core.SolrResourceLoader; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * Tests for {@link StempelPolishStemFilterFactory} - */ -public class TestStempelPolishStemFilterFactory extends BaseTokenStreamTestCase { - public void testBasics() throws Exception { - StringReader document = new StringReader("studenta studenci"); - StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory(); - factory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = factory.create(new WhitespaceTokenizer(TEST_VERSION_CURRENT, document)); - assertTokenStreamContents(ts, - new String[] { "student", "student" }); - } -} Index: solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java =================================================================== --- solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java (revision 1365483) +++ solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/SolrStopwordsCarrot2LexicalDataFactory.java (working copy) @@ -23,8 +23,8 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.apache.solr.analysis.CommonGramsFilterFactory; -import org.apache.solr.analysis.StopFilterFactory; +import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory; +import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.solr.analysis.TokenizerChain; import org.apache.solr.schema.IndexSchema; import org.carrot2.core.LanguageCode; Index: solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMAAnnotationsTokenizerFactory.java =================================================================== --- solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMAAnnotationsTokenizerFactory.java (revision 1365483) +++ solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMAAnnotationsTokenizerFactory.java (working copy) @@ 
-1,46 +0,0 @@ -package org.apache.solr.uima.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; -import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer; - -import java.io.Reader; -import java.util.Map; - -/** - * {@link org.apache.lucene.analysis.util.TokenizerFactory} for {@link UIMAAnnotationsTokenizer} - */ -public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory { - - private String descriptorPath; - private String tokenType; - - @Override - public void init(Map args) { - super.init(args); - descriptorPath = args.get("descriptorPath"); - tokenType = args.get("tokenType"); - } - - @Override - public Tokenizer create(Reader input) { - return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input); - } -} Index: solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMATypeAwareAnnotationsTokenizerFactory.java =================================================================== --- solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMATypeAwareAnnotationsTokenizerFactory.java (revision 1365483) +++ 
solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMATypeAwareAnnotationsTokenizerFactory.java (working copy) @@ -1,48 +0,0 @@ -package org.apache.solr.uima.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * {@link org.apache.lucene.analysis.util.TokenizerFactory} for {@link UIMATypeAwareAnnotationsTokenizer} - */ -public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory { - - private String descriptorPath; - private String tokenType; - private String featurePath; - - @Override - public void init(Map args) { - super.init(args); - descriptorPath = args.get("descriptorPath"); - tokenType = args.get("tokenType"); - featurePath = args.get("featurePath"); - } - - @Override - public Tokenizer create(Reader input) { - return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input); - } -} Index: solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMAAnnotationsTokenizerFactory.java 
=================================================================== --- solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMAAnnotationsTokenizerFactory.java (revision 1365483) +++ solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMAAnnotationsTokenizerFactory.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.solr.uima.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; -import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer; - -import java.io.Reader; -import java.util.Map; - -/** - * {@link org.apache.lucene.analysis.util.TokenizerFactory} for {@link UIMAAnnotationsTokenizer} - */ -public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory { - - private String descriptorPath; - private String tokenType; - - @Override - public void init(Map args) { - super.init(args); - descriptorPath = args.get("descriptorPath"); - tokenType = args.get("tokenType"); - } - - @Override - public Tokenizer create(Reader input) { - return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input); - } -} Index: solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMATypeAwareAnnotationsTokenizerFactory.java =================================================================== --- solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMATypeAwareAnnotationsTokenizerFactory.java (revision 1365483) +++ solr/contrib/uima/src/java/org/apache/solr/uima/analysis/UIMATypeAwareAnnotationsTokenizerFactory.java (working copy) @@ -1,48 +0,0 @@ -package org.apache.solr.uima.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * {@link org.apache.lucene.analysis.util.TokenizerFactory} for {@link UIMATypeAwareAnnotationsTokenizer} - */ -public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory { - - private String descriptorPath; - private String tokenType; - private String featurePath; - - @Override - public void init(Map args) { - super.init(args); - descriptorPath = args.get("descriptorPath"); - tokenType = args.get("tokenType"); - featurePath = args.get("featurePath"); - } - - @Override - public Tokenizer create(Reader input) { - return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input); - } -} Index: solr/contrib/uima/src/test-files/uima/uima-tokenizers-schema.xml =================================================================== --- solr/contrib/uima/src/test-files/uima/uima-tokenizers-schema.xml (revision 1365483) +++ solr/contrib/uima/src/test-files/uima/uima-tokenizers-schema.xml (working copy) @@ -299,14 +299,14 @@ - - Index: solr/core =================================================================== --- solr/core (revision 1365483) +++ solr/core (working copy) Property changes on: solr/core ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/solr/core:r1364862-1365496 Index: solr/core/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java (revision 1365483) +++ 
solr/core/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java (working copy) @@ -1,47 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - - -/** - * Factory for {@link ArabicNormalizationFilter}. - *

- * <fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.ArabicNormalizationFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ArabicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - public ArabicNormalizationFilter create(TokenStream input) { - return new ArabicNormalizationFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ar.ArabicStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - - -/** - * Factory for {@link ArabicStemFilter}. - *
- * <fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.ArabicNormalizationFilterFactory"/>
- *     <filter class="solr.ArabicStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ArabicStemFilterFactory extends TokenFilterFactory { - - - public ArabicStemFilter create(TokenStream input) { - return new ArabicStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java (working copy) @@ -1,49 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; -import org.apache.lucene.analysis.TokenStream; - -/** - * Factory for {@link ASCIIFoldingFilter}. - *
- * <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.ASCIIFoldingFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ASCIIFoldingFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - public ASCIIFoldingFilter create(TokenStream input) { - return new ASCIIFoldingFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} - Index: solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java (working copy) @@ -1,77 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.Arrays; -import java.util.HashSet; -import java.util.Map; - -import org.apache.commons.codec.language.bm.Languages.LanguageSet; -import org.apache.commons.codec.language.bm.NameType; -import org.apache.commons.codec.language.bm.PhoneticEngine; -import org.apache.commons.codec.language.bm.RuleType; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.phonetic.BeiderMorseFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link BeiderMorseFilter}. - *
- * <fieldType name="text_bm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.BeiderMorseFilterFactory"
- *        nameType="GENERIC" ruleType="APPROX" 
- *        concat="true" languageSet="auto"
- *     </filter>
- *   </analyzer>
- * </fieldType>
- * - */ -public class BeiderMorseFilterFactory extends TokenFilterFactory { - private PhoneticEngine engine; - private LanguageSet languageSet; - - public void init(Map args) { - super.init(args); - - // PhoneticEngine = NameType + RuleType + concat - // we use common-codec's defaults: GENERIC + APPROX + true - String nameTypeArg = args.get("nameType"); - NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg); - - String ruleTypeArg = args.get("ruleType"); - RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg); - - boolean concat = getBoolean("concat", true); - engine = new PhoneticEngine(nameType, ruleType, concat); - - // LanguageSet: defaults to automagic, otherwise a comma-separated list. - String languageSetArg = args.get("languageSet"); - if (languageSetArg == null || languageSetArg.equals("auto")) { - languageSet = null; - } else { - languageSet = LanguageSet.from(new HashSet(Arrays.asList(languageSetArg.split(",")))); - } - } - - @Override - public TokenStream create(TokenStream input) { - return new BeiderMorseFilter(input, engine, languageSet); - } -} Index: solr/core/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java (working copy) @@ -1,43 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.br.BrazilianStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link BrazilianStemFilter}. - *
- * <fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.BrazilianStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class BrazilianStemFilterFactory extends TokenFilterFactory { - public BrazilianStemFilter create(TokenStream in) { - return new BrazilianStemFilter(in); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.bg.BulgarianStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link BulgarianStemFilter}. - *
- * <fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.BulgarianStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class BulgarianStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new BulgarianStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java (working copy) @@ -1,140 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Map; -import java.util.StringTokenizer; - -/** - * Factory for {@link CapitalizationFilter}. - *

- * The factory takes parameters:
- * "onlyFirstWord" - should each word be capitalized or all of the words?
- * "keep" - a keep word list. Each word that should be kept separated by whitespace.
- * "keepIgnoreCase - true or false. If true, the keep list will be considered case-insensitive.
- * "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list
- * "okPrefix" - do not change word capitalization if a word begins with something in this list. - * for example if "McK" is on the okPrefix list, the word "McKinley" should not be changed to - * "Mckinley"
- * "minWordLength" - how long the word needs to be to get capitalization applied. If the - * minWordLength is 3, "and" > "And" but "or" stays "or"
- * "maxWordCount" - if the token contains more then maxWordCount words, the capitalization is - * assumed to be correct.
- * - *

- * <fieldType name="text_cptlztn" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.CapitalizationFilterFactory" onlyFirstWord="true"
- *     	     keep="java solr lucene" keepIgnoreCase="false"
- *     	     okPrefix="McK McD McA"/>   
- *   </analyzer>
- * </fieldType>
- * - * - * @since solr 1.3 - */ -public class CapitalizationFilterFactory extends TokenFilterFactory { - public static final String KEEP = "keep"; - public static final String KEEP_IGNORE_CASE = "keepIgnoreCase"; - public static final String OK_PREFIX = "okPrefix"; - public static final String MIN_WORD_LENGTH = "minWordLength"; - public static final String MAX_WORD_COUNT = "maxWordCount"; - public static final String MAX_TOKEN_LENGTH = "maxTokenLength"; - public static final String ONLY_FIRST_WORD = "onlyFirstWord"; - public static final String FORCE_FIRST_LETTER = "forceFirstLetter"; - - //Map keep = new HashMap(); // not synchronized because it is only initialized once - CharArraySet keep; - - Collection okPrefix = Collections.emptyList(); // for Example: McK - - int minWordLength = 0; // don't modify capitalization for words shorter then this - int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT; - int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH; - boolean onlyFirstWord = true; - boolean forceFirstLetter = true; // make sure the first letter is capitol even if it is in the keep list - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - - String k = args.get(KEEP); - if (k != null) { - StringTokenizer st = new StringTokenizer(k); - boolean ignoreCase = false; - String ignoreStr = args.get(KEEP_IGNORE_CASE); - if ("true".equalsIgnoreCase(ignoreStr)) { - ignoreCase = true; - } - keep = new CharArraySet(luceneMatchVersion, 10, ignoreCase); - while (st.hasMoreTokens()) { - k = st.nextToken().trim(); - keep.add(k.toCharArray()); - } - } - - k = args.get(OK_PREFIX); - if (k != null) { - okPrefix = new ArrayList(); - StringTokenizer st = new StringTokenizer(k); - while (st.hasMoreTokens()) { - okPrefix.add(st.nextToken().trim().toCharArray()); - } - } - - k = args.get(MIN_WORD_LENGTH); - if (k != null) { - minWordLength = Integer.valueOf(k); - } - - k = args.get(MAX_WORD_COUNT); - if (k != null) { - 
maxWordCount = Integer.valueOf(k); - } - - k = args.get(MAX_TOKEN_LENGTH); - if (k != null) { - maxTokenLength = Integer.valueOf(k); - } - - k = args.get(ONLY_FIRST_WORD); - if (k != null) { - onlyFirstWord = Boolean.valueOf(k); - } - - k = args.get(FORCE_FIRST_LETTER); - if (k != null) { - forceFirstLetter = Boolean.valueOf(k); - } - } - - public CapitalizationFilter create(TokenStream input) { - return new CapitalizationFilter(input, onlyFirstWord, keep, - forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength); - } -} Index: solr/core/src/java/org/apache/solr/analysis/CJKBigramFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CJKBigramFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/CJKBigramFilterFactory.java (working copy) @@ -1,65 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.cjk.CJKBigramFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link CJKBigramFilter}. - *
- * <fieldType name="text_cjk" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.CJKWidthFilterFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.CJKBigramFilterFactory" 
- *       han="true" hiragana="true" 
- *       katakana="true" hangul="true" />
- *   </analyzer>
- * </fieldType>
- */ -public class CJKBigramFilterFactory extends TokenFilterFactory { - int flags; - - @Override - public void init(Map args) { - super.init(args); - flags = 0; - if (getBoolean("han", true)) { - flags |= CJKBigramFilter.HAN; - } - if (getBoolean("hiragana", true)) { - flags |= CJKBigramFilter.HIRAGANA; - } - if (getBoolean("katakana", true)) { - flags |= CJKBigramFilter.KATAKANA; - } - if (getBoolean("hangul", true)) { - flags |= CJKBigramFilter.HANGUL; - } - } - - @Override - public TokenStream create(TokenStream input) { - return new CJKBigramFilter(input, flags); - } -} Index: solr/core/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/CJKWidthFilterFactory.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.cjk.CJKWidthFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link CJKWidthFilter}. - *
- * <fieldType name="text_cjk" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.CJKWidthFilterFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.CJKBigramFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- */ - -public class CJKWidthFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - @Override - public TokenStream create(TokenStream input) { - return new CJKWidthFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/ClassicFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ClassicFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ClassicFilterFactory.java (working copy) @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.apache.lucene.analysis.standard.ClassicFilter; - -/** - * Factory for {@link ClassicFilter}. - *
- * <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.ClassicTokenizerFactory"/>
- *     <filter class="solr.ClassicFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - * - */ -public class ClassicFilterFactory extends TokenFilterFactory { - public TokenFilter create(TokenStream input) { - return new ClassicFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java (working copy) @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.standard.ClassicTokenizer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link ClassicTokenizer}. - *
- * <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.ClassicTokenizerFactory" maxTokenLength="120"/>
- *   </analyzer>
- * </fieldType>
- * - * - */ - -public class ClassicTokenizerFactory extends TokenizerFactory { - - private int maxTokenLength; - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - maxTokenLength = getInt("maxTokenLength", - StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); - } - - public Tokenizer create(Reader input) { - ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, input); - tokenizer.setMaxTokenLength(maxTokenLength); - return tokenizer; - } -} Index: solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java (working copy) @@ -1,82 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.analysis; - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.commongrams.CommonGramsFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; -import org.apache.lucene.analysis.util.*; - -/** - * Constructs a {@link CommonGramsFilter}. - *
- * <fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/>
- *   </analyzer>
- * </fieldType>
- * - */ - -/* - * This is pretty close to a straight copy from StopFilterFactory - */ -public class CommonGramsFilterFactory extends TokenFilterFactory implements - ResourceLoaderAware { - - public void inform(ResourceLoader loader) { - String commonWordFiles = args.get("words"); - ignoreCase = getBoolean("ignoreCase", false); - - if (commonWordFiles != null) { - try { - if ("snowball".equalsIgnoreCase(args.get("format"))) { - commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase); - } else { - commonWords = getWordSet(loader, commonWordFiles, ignoreCase); - } - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading common word file", e); - } - } else { - commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; - } - } - - //Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095 - private CharArraySet commonWords; - private boolean ignoreCase; - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public CharArraySet getCommonWords() { - return commonWords; - } - - public CommonGramsFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords); - return commonGrams; - } -} - - - \ No newline at end of file Index: solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java (working copy) @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.analysis; - -import java.io.IOException; -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.commongrams.CommonGramsFilter; -import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; -import org.apache.lucene.analysis.util.*; - -/** - * Construct {@link CommonGramsQueryFilter}. - * - * This is pretty close to a straight copy from {@link StopFilterFactory}. - * - *
- * <fieldType name="text_cmmngrmsqry" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.CommonGramsQueryFilterFactory" words="commongramsquerystopwords.txt" ignoreCase="false"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class CommonGramsQueryFilterFactory extends TokenFilterFactory - implements ResourceLoaderAware { - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public void inform(ResourceLoader loader) { - String commonWordFiles = args.get("words"); - ignoreCase = getBoolean("ignoreCase", false); - - if (commonWordFiles != null) { - try { - if ("snowball".equalsIgnoreCase(args.get("format"))) { - commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase); - } else { - commonWords = getWordSet(loader, commonWordFiles, ignoreCase); - } - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading common word file", e); - } - } else { - commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; - } - } - - // Force the use of a char array set, as it is the most performant, although - // this may break things if Lucene ever goes away from it. See SOLR-1095 - private CharArraySet commonWords; - - private boolean ignoreCase; - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public CharArraySet getCommonWords() { - return commonWords; - } - - /** - * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter - */ - public CommonGramsQueryFilter create(TokenStream input) { - CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords); - CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter( - commonGrams); - return commonGramsQuery; - } -} Index: solr/core/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor 
license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.cz.CzechStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link CzechStemFilter}. - *
- * <fieldType name="text_czstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.CzechStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class CzechStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new CzechStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java (working copy) @@ -1,83 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; -import org.apache.lucene.analysis.payloads.PayloadEncoder; -import org.apache.lucene.analysis.payloads.FloatEncoder; -import org.apache.lucene.analysis.payloads.IntegerEncoder; -import org.apache.lucene.analysis.payloads.IdentityEncoder; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; - - -/** - * - * Factory for {@link DelimitedPayloadTokenFilter}. - *
- * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float" delimiter="|"/>
- *   </analyzer>
- * </fieldType>
- * - * - */ -public class DelimitedPayloadTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - public static final String ENCODER_ATTR = "encoder"; - public static final String DELIMITER_ATTR = "delimiter"; - - private PayloadEncoder encoder; - private char delimiter = '|'; - - public DelimitedPayloadTokenFilter create(TokenStream input) { - return new DelimitedPayloadTokenFilter(input, delimiter, encoder); - } - - @Override - public void init(Map args) { - super.init(args); - } - - public void inform(ResourceLoader loader) { - String encoderClass = args.get(ENCODER_ATTR); - if (encoderClass.equals("float")){ - encoder = new FloatEncoder(); - } else if (encoderClass.equals("integer")){ - encoder = new IntegerEncoder(); - } else if (encoderClass.equals("identity")){ - encoder = new IdentityEncoder(); - } else { - encoder = loader.newInstance(encoderClass, PayloadEncoder.class); - } - - String delim = args.get(DELIMITER_ATTR); - if (delim != null){ - if (delim.length() == 1) { - delimiter = delim.charAt(0); - } else{ - throw new InitializationException("Delimiter must be one character only"); - } - } - } -} \ No newline at end of file Index: solr/core/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java (working copy) @@ -1,72 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; -import org.apache.lucene.analysis.compound.*; -import org.apache.lucene.analysis.util.*; -import org.apache.lucene.analysis.TokenStream; - -import java.util.Map; -import java.io.IOException; - -/** - * Factory for {@link DictionaryCompoundWordTokenFilter}. - *
- * <fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
- *     	     minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class DictionaryCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private CharArraySet dictionary; - private String dictFile; - private int minWordSize; - private int minSubwordSize; - private int maxSubwordSize; - private boolean onlyLongestMatch; - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - dictFile = args.get("dictionary"); - if (null == dictFile) { - throw new InitializationException("Missing required parameter: dictionary"); - } - - minWordSize= getInt("minWordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); - minSubwordSize= getInt("minSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); - maxSubwordSize= getInt("maxSubwordSize",CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); - onlyLongestMatch = getBoolean("onlyLongestMatch",true); - } - public void inform(ResourceLoader loader) { - try { - dictionary = super.getWordSet(loader, dictFile, false); - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading dictionary", e); - } - } - public DictionaryCompoundWordTokenFilter create(TokenStream input) { - return new DictionaryCompoundWordTokenFilter(luceneMatchVersion,input,dictionary,minWordSize,minSubwordSize,maxSubwordSize,onlyLongestMatch); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java (working copy) @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.analysis; - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link DoubleMetaphoneFilter}. - *
- * <fieldType name="text_dblmtphn" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.DoubleMetaphoneFilterFactory" inject="true" maxCodeLength="4"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class DoubleMetaphoneFilterFactory extends TokenFilterFactory -{ - public static final String INJECT = "inject"; - public static final String MAX_CODE_LENGTH = "maxCodeLength"; - - public static final int DEFAULT_MAX_CODE_LENGTH = 4; - - private boolean inject = true; - private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH; - - @Override - public void init(Map args) { - super.init(args); - - inject = getBoolean(INJECT, true); - - if (args.get(MAX_CODE_LENGTH) != null) { - maxCodeLength = Integer.parseInt(args.get(MAX_CODE_LENGTH)); - } - } - - public DoubleMetaphoneFilter create(TokenStream input) { - return new DoubleMetaphoneFilter(input, maxCodeLength, inject); - } -} Index: solr/core/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java (working copy) @@ -1,63 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.Map; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Creates new instances of {@link EdgeNGramTokenFilter}. - *
- * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.EdgeNGramFilterFactory" side="front" minGramSize="1" maxGramSize="1"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class EdgeNGramFilterFactory extends TokenFilterFactory { - private int maxGramSize = 0; - - private int minGramSize = 0; - - private String side; - - @Override - public void init(Map args) { - super.init(args); - String maxArg = args.get("maxGramSize"); - maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) - : EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); - - String minArg = args.get("minGramSize"); - minGramSize = (minArg != null ? Integer.parseInt(minArg) - : EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE); - - side = args.get("side"); - if (side == null) { - side = EdgeNGramTokenFilter.Side.FRONT.getLabel(); - } - } - - public EdgeNGramTokenFilter create(TokenStream input) { - return new EdgeNGramTokenFilter(input, side, minGramSize, maxGramSize); - } -} Index: solr/core/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java (working copy) @@ -1,61 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Creates new instances of {@link EdgeNGramTokenizer}. - *
- * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.EdgeNGramTokenizerFactory" side="front" minGramSize="1" maxGramSize="1"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class EdgeNGramTokenizerFactory extends TokenizerFactory { - private int maxGramSize = 0; - - private int minGramSize = 0; - - private String side; - - @Override - public void init(Map args) { - super.init(args); - String maxArg = args.get("maxGramSize"); - maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); - - String minArg = args.get("minGramSize"); - minGramSize = (minArg != null ? Integer.parseInt(minArg) : EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE); - - side = args.get("side"); - if (side == null) { - side = EdgeNGramTokenizer.Side.FRONT.getLabel(); - } - } - - public EdgeNGramTokenizer create(Reader input) { - return new EdgeNGramTokenizer(input, side, minGramSize, maxGramSize); - } -} Index: solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ElisionFilterFactory.java (working copy) @@ -1,64 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.util.*; -import org.apache.lucene.analysis.fr.*; - -import java.io.IOException; -import org.apache.lucene.analysis.TokenStream; - -/** - * Factory for {@link ElisionFilter}. - *
- * <fieldType name="text_elsn" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.ElisionFilterFactory" 
- *       articles="stopwordarticles.txt" ignoreCase="true"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ElisionFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - private CharArraySet articles; - - public void inform(ResourceLoader loader) { - String articlesFile = args.get("articles"); - boolean ignoreCase = getBoolean("ignoreCase", false); - - if (articlesFile != null) { - try { - articles = getWordSet(loader, articlesFile, ignoreCase); - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading articles", e); - } - } - } - - public ElisionFilter create(TokenStream input) { - assureMatchVersion(); - return articles == null ? new ElisionFilter(luceneMatchVersion,input) : - new ElisionFilter(luceneMatchVersion,input,articles); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link EnglishMinimalStemFilter}. - *
- * <fieldType name="text_enminstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.EnglishMinimalStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class EnglishMinimalStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new EnglishMinimalStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java (working copy) @@ -1,49 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.en.EnglishPossessiveFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link EnglishPossessiveFilter}. - *
- * <fieldType name="text_enpossessive" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.EnglishPossessiveFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class EnglishPossessiveFilterFactory extends TokenFilterFactory { - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public TokenStream create(TokenStream input) { - return new EnglishPossessiveFilter(luceneMatchVersion, input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.fi.FinnishLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link FinnishLightStemFilter}. - *
- * <fieldType name="text_filgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.FinnishLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class FinnishLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new FinnishLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java (working copy) @@ -1,41 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.fr.FrenchLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link FrenchLightStemFilter}. - *
- * <fieldType name="text_frlgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.ElisionFilterFactory"/>
- *     <filter class="solr.FrenchLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class FrenchLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new FrenchLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java (working copy) @@ -1,41 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link FrenchMinimalStemFilter}. - *
- * <fieldType name="text_frminstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.ElisionFilterFactory"/>
- *     <filter class="solr.FrenchMinimalStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class FrenchMinimalStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new FrenchMinimalStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GalicianMinimalStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.gl.GalicianMinimalStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GalicianMinimalStemFilter}. - *
- * <fieldType name="text_glplural" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.GalicianMinimalStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GalicianMinimalStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new GalicianMinimalStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.gl.GalicianStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GalicianStemFilter}. - *
- * <fieldType name="text_glstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.GalicianStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GalicianStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new GalicianStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.de.GermanLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GermanLightStemFilter}. - *
- * <fieldType name="text_delgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.GermanLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GermanLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new GermanLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.de.GermanMinimalStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GermanMinimalStemFilter}. - *
- * <fieldType name="text_deminstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.GermanMinimalStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GermanMinimalStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new GermanMinimalStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GermanNormalizationFilterFactory.java (working copy) @@ -1,47 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.de.GermanNormalizationFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GermanNormalizationFilter}. - *
- * <fieldType name="text_denorm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.GermanNormalizationFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class GermanNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - public TokenStream create(TokenStream input) { - return new GermanNormalizationFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java (working copy) @@ -1,43 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.de.GermanStemFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GermanStemFilter}. - *
- * <fieldType name="text_destem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.GermanStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GermanStemFilterFactory extends TokenFilterFactory { - public GermanStemFilter create(TokenStream in) { - return new GermanStemFilter(in); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java (working copy) @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.el.GreekLowerCaseFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GreekLowerCaseFilter}. - *
- * <fieldType name="text_glc" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.GreekLowerCaseFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GreekLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - if (args.containsKey("charset")) - throw new InitializationException( - "The charset parameter is no longer supported. " - + "Please process your documents as Unicode instead."); - } - - public GreekLowerCaseFilter create(TokenStream in) { - return new GreekLowerCaseFilter(luceneMatchVersion, in); - } - - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} - Index: solr/core/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.el.GreekStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link GreekStemFilter}. - *
- * <fieldType name="text_gstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.GreekLowerCaseFilterFactory"/>
- *     <filter class="solr.GreekStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class GreekStemFilterFactory extends TokenFilterFactory { - - public TokenStream create(TokenStream input) { - return new GreekStemFilter(input); - } - -} Index: solr/core/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HindiNormalizationFilterFactory.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hi.HindiNormalizationFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link HindiNormalizationFilter}. - *
- * <fieldType name="text_hinormal" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.HindiNormalizationFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class HindiNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - public TokenStream create(TokenStream input) { - return new HindiNormalizationFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hi.HindiStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link HindiStemFilter}. - *
- * <fieldType name="text_histem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.HindiStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class HindiStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new HindiStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java (working copy) @@ -1,71 +0,0 @@ -package org.apache.solr.analysis; - - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; -import org.apache.lucene.analysis.util.CharFilterFactory; - -import java.io.Reader; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** -* Factory for {@link HTMLStripCharFilter}. - *
- * <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" />
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ - public class HTMLStripCharFilterFactory extends CharFilterFactory { - - Set escapedTags = null; - Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+"); - - public HTMLStripCharFilter create(Reader input) { - HTMLStripCharFilter charFilter; - if (null == escapedTags) { - charFilter = new HTMLStripCharFilter(input); - } else { - charFilter = new HTMLStripCharFilter(input, escapedTags); - } - return charFilter; - } - - @Override - public void init(Map args) { - super.init(args); - String escapedTagsArg = args.get("escapedTags"); - if (null != escapedTagsArg) { - Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg); - while (matcher.find()) { - if (null == escapedTags) { - escapedTags = new HashSet(); - } - escapedTags.add(matcher.group(0)); - } - } - } -} Index: solr/core/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hu.HungarianLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link HungarianLightStemFilter}. - *
- * <fieldType name="text_hulgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.HungarianLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class HungarianLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new HungarianLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java (working copy) @@ -1,117 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hunspell.HunspellDictionary; -import org.apache.lucene.analysis.hunspell.HunspellStemFilter; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import org.apache.lucene.util.IOUtils; - -/** - * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}. - * Example config for British English including a custom dictionary, case insensitive matching: - *
- * <filter class="solr.HunspellStemFilterFactory"
- *    dictionary="en_GB.dic,my_custom.dic"
- *    affix="en_GB.aff"
- *    ignoreCase="true" />
- * Both parameters dictionary and affix are mandatory. - *
- * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false. - *
- * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true. - * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored. - *
- * Dictionaries for many languages are available through the OpenOffice project. - * - * See http://wiki.apache.org/solr/Hunspell - */ -public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - private static final String PARAM_DICTIONARY = "dictionary"; - private static final String PARAM_AFFIX = "affix"; - private static final String PARAM_IGNORE_CASE = "ignoreCase"; - private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; - private static final String TRUE = "true"; - private static final String FALSE = "false"; - - private HunspellDictionary dictionary; - private boolean ignoreCase = false; - - /** - * Loads the hunspell dictionary and affix files defined in the configuration - * - * @param loader ResourceLoader used to load the files - */ - public void inform(ResourceLoader loader) { - assureMatchVersion(); - String dictionaryFiles[] = args.get(PARAM_DICTIONARY).split(","); - String affixFile = args.get(PARAM_AFFIX); - String pic = args.get(PARAM_IGNORE_CASE); - if(pic != null) { - if(pic.equalsIgnoreCase(TRUE)) ignoreCase = true; - else if(pic.equalsIgnoreCase(FALSE)) ignoreCase = false; - else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false"); - } - - String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING); - boolean strictAffixParsing = true; - if(strictAffixParsingParam != null) { - if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false; - else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true; - else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". 
Must be true or false"); - } - - InputStream affix = null; - List dictionaries = new ArrayList(); - - try { - dictionaries = new ArrayList(); - for (String file : dictionaryFiles) { - dictionaries.add(loader.openResource(file)); - } - affix = loader.openResource(affixFile); - - this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing); - } catch (Exception e) { - throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e); - } finally { - IOUtils.closeWhileHandlingException(affix); - IOUtils.closeWhileHandlingException(dictionaries); - } - } - - /** - * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given - * TokenStream - * - * @param tokenStream TokenStream that will be filtered - * @return HunspellStemFilter that filters the TokenStream - */ - public TokenStream create(TokenStream tokenStream) { - return new HunspellStemFilter(tokenStream, dictionary); - } -} Index: solr/core/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link HyphenatedWordsFilter}. - *
- * <fieldType name="text_hyphn" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.HyphenatedWordsFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class HyphenatedWordsFilterFactory extends TokenFilterFactory { - public HyphenatedWordsFilter create(TokenStream input) { - return new HyphenatedWordsFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java (working copy) @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.commons.io.IOUtils; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; -import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; -import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; -import org.apache.lucene.analysis.util.*; - -import java.util.Map; -import java.io.InputStream; -import org.xml.sax.InputSource; - -/** - * Factory for {@link HyphenationCompoundWordTokenFilter}. - *

- * This factory accepts the following parameters: - *

    - *
  • hyphenator (mandatory): path to the FOP xml hyphenation pattern. - * See http://offo.sourceforge.net/hyphenation/. - *
  • encoding (optional): encoding of the xml hyphenation file. defaults to UTF-8. - *
  • dictionary (optional): dictionary of words. defaults to no dictionary. - *
  • minWordSize (optional): minimal word length that gets decomposed. defaults to 5. - *
  • minSubwordSize (optional): minimum length of subwords. defaults to 2. - *
  • maxSubwordSize (optional): maximum length of subwords. defaults to 15. - *
  • onlyLongestMatch (optional): if true, adds only the longest matching subword - * to the stream. defaults to false. - *
- *

- *

- * <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
- *     	     dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/>
- *   </analyzer>
- * </fieldType>
- * - * @see HyphenationCompoundWordTokenFilter - */ -public class HyphenationCompoundWordTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private CharArraySet dictionary; - private HyphenationTree hyphenator; - private String dictFile; - private String hypFile; - private String encoding; - private int minWordSize; - private int minSubwordSize; - private int maxSubwordSize; - private boolean onlyLongestMatch; - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - dictFile = args.get("dictionary"); - if (args.containsKey("encoding")) - encoding = args.get("encoding"); - hypFile = args.get("hyphenator"); - if (null == hypFile) { - throw new InitializationException("Missing required parameter: hyphenator"); - } - - minWordSize = getInt("minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE); - minSubwordSize = getInt("minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE); - maxSubwordSize = getInt("maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE); - onlyLongestMatch = getBoolean("onlyLongestMatch", false); - } - - public void inform(ResourceLoader loader) { - InputStream stream = null; - try { - if (dictFile != null) // the dictionary can be empty. 
- dictionary = getWordSet(loader, dictFile, false); - // TODO: Broken, because we cannot resolve real system id - // ResourceLoader should also supply method like ClassLoader to get resource URL - stream = loader.openResource(hypFile); - final InputSource is = new InputSource(stream); - is.setEncoding(encoding); // if it's null let xml parser decide - is.setSystemId(hypFile); - hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is); - } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception" - throw new InitializationException("Exception thrown while loading dictionary and hyphenation file", e); - } finally { - IOUtils.closeQuietly(stream); - } - } - - public HyphenationCompoundWordTokenFilter create(TokenStream input) { - return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch); - } -} Index: solr/core/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.in.IndicNormalizationFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link IndicNormalizationFilter}. - *
- * <fieldType name="text_innormal" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.IndicNormalizationFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class IndicNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - public TokenStream create(TokenStream input) { - return new IndicNormalizationFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.id.IndonesianStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link IndonesianStemFilter}. - *
- * <fieldType name="text_idstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class IndonesianStemFilterFactory extends TokenFilterFactory { - private boolean stemDerivational = true; - - @Override - public void init(Map args) { - super.init(args); - stemDerivational = getBoolean("stemDerivational", true); - } - - public TokenStream create(TokenStream input) { - return new IndonesianStemFilter(input, stemDerivational); - } -} Index: solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/IrishLowerCaseFilterFactory.java (working copy) @@ -1,49 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ga.IrishLowerCaseFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link IrishLowerCaseFilter}. - *
- * <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.IrishLowerCaseFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class IrishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - - @Override - public TokenStream create(TokenStream input) { - return new IrishLowerCaseFilter(input); - } - - // this will 'mostly work', except for special cases, just like most other filters - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.it.ItalianLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link ItalianLightStemFilter}. - *
- * <fieldType name="text_itlgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.ItalianLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ItalianLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new ItalianLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/JapaneseBaseFormFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/JapaneseBaseFormFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/JapaneseBaseFormFilterFactory.java (working copy) @@ -1,41 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ja.JapaneseBaseFormFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link org.apache.lucene.analysis.ja.JapaneseBaseFormFilter}. - *
- * <fieldType name="text_ja" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
- *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * 
- */ -public class JapaneseBaseFormFilterFactory extends TokenFilterFactory { - - @Override - public TokenStream create(TokenStream input) { - return new JapaneseBaseFormFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/JapaneseIterationMarkCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/JapaneseIterationMarkCharFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/JapaneseIterationMarkCharFilterFactory.java (working copy) @@ -1,65 +0,0 @@ -package org.apache.solr.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.CharFilterFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter}. - *
- * <fieldType name="text_ja" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="false">
- *   <analyzer>
- *     <charFilter class="solr.JapaneseIterationMarkCharFilterFactory normalizeKanji="true" normalizeKana="true"/>
- *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class JapaneseIterationMarkCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent { - - private static final String NORMALIZE_KANJI_PARAM = "normalizeKanji"; - - private static final String NORMALIZE_KANA_PARAM = "normalizeKana"; - - private boolean normalizeKanji = true; - - private boolean normalizeKana = true; - - @Override - public CharFilter create(Reader input) { - return new JapaneseIterationMarkCharFilter(input, normalizeKanji, normalizeKana); - } - - @Override - public void init(Map args) { - super.init(args); - normalizeKanji = getBoolean(NORMALIZE_KANJI_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANJI_DEFAULT); - normalizeKana = getBoolean(NORMALIZE_KANA_PARAM, JapaneseIterationMarkCharFilter.NORMALIZE_KANA_DEFAULT); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/JapaneseKatakanaStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/JapaneseKatakanaStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/JapaneseKatakanaStemFilterFactory.java (working copy) @@ -1,55 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; - -/** - * Factory for {@link JapaneseKatakanaStemFilterFactory}. - *
- * <fieldType name="text_ja" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
- *     <filter class="solr.JapaneseKatakanaStemFilterFactory"
- *             minimumLength="4"/>
- *   </analyzer>
- * </fieldType>
- * 
- */ -public class JapaneseKatakanaStemFilterFactory extends TokenFilterFactory { - private static final String MINIMUM_LENGTH_PARAM = "minimumLength"; - private int minimumLength; - - @Override - public void init(Map args) { - super.init(args); - minimumLength = getInt(MINIMUM_LENGTH_PARAM, JapaneseKatakanaStemFilter.DEFAULT_MINIMUM_LENGTH); - if (minimumLength < 2) { - throw new InitializationException("Illegal " + MINIMUM_LENGTH_PARAM + " " + minimumLength + " (must be 2 or greater)"); - } - } - - public TokenStream create(TokenStream input) { - return new JapaneseKatakanaStemFilter(input, minimumLength); - } -} Index: solr/core/src/java/org/apache/solr/analysis/JapanesePartOfSpeechStopFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/JapanesePartOfSpeechStopFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/JapanesePartOfSpeechStopFilterFactory.java (working copy) @@ -1,63 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter; -import org.apache.lucene.analysis.util.*; - -/** - * Factory for {@link org.apache.lucene.analysis.ja.JapanesePartOfSpeechStopFilter}. - *
- * <fieldType name="text_ja" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
- *     <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
- *             tags="stopTags.txt" 
- *             enablePositionIncrements="true"/>
- *   </analyzer>
- * </fieldType>
- * 
- */ -public class JapanesePartOfSpeechStopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private boolean enablePositionIncrements; - private Set stopTags; - - public void inform(ResourceLoader loader) { - String stopTagFiles = args.get("tags"); - enablePositionIncrements = getBoolean("enablePositionIncrements", false); - try { - CharArraySet cas = getWordSet(loader, stopTagFiles, false); - stopTags = new HashSet(); - for (Object element : cas) { - char chars[] = (char[]) element; - stopTags.add(new String(chars)); - } - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading tags", e); - } - } - - public TokenStream create(TokenStream stream) { - return new JapanesePartOfSpeechStopFilter(enablePositionIncrements, stream, stopTags); - } -} Index: solr/core/src/java/org/apache/solr/analysis/JapaneseReadingFormFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/JapaneseReadingFormFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/JapaneseReadingFormFilterFactory.java (working copy) @@ -1,51 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; - -/** - * Factory for {@link org.apache.lucene.analysis.ja.JapaneseReadingFormFilter}. - *
- * <fieldType name="text_ja" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.JapaneseTokenizerFactory"/>
- *     <filter class="solr.JapaneseReadingFormFilterFactory"
- *             useRomaji="false"/>
- *   </analyzer>
- * </fieldType>
- * 
- */ -public class JapaneseReadingFormFilterFactory extends TokenFilterFactory { - private static final String ROMAJI_PARAM = "useRomaji"; - private boolean useRomaji; - - @Override - public void init(Map args) { - super.init(args); - useRomaji = getBoolean(ROMAJI_PARAM, false); - } - - public TokenStream create(TokenStream input) { - return new JapaneseReadingFormFilter(input, useRomaji); - } -} Index: solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/JapaneseTokenizerFactory.java (working copy) @@ -1,108 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.util.Locale; -import java.util.Map; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.ja.JapaneseTokenizer; -import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; -import org.apache.lucene.analysis.ja.dict.UserDictionary; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenizerFactory; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.lucene.analysis.util.ResourceLoaderAware; - -/** - * Factory for {@link org.apache.lucene.analysis.ja.JapaneseTokenizer}. - *
- * <fieldType name="text_ja" class="solr.TextField">
- *   <analyzer>
- *     <tokenizer class="solr.JapaneseTokenizerFactory"
- *       mode="NORMAL"
- *       userDictionary="user.txt"
- *       userDictionaryEncoding="UTF-8"
- *       discardPunctuation="true"
- *     />
- *     <filter class="solr.JapaneseBaseFormFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * 
- */ -public class JapaneseTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware { - private static final String MODE = "mode"; - - private static final String USER_DICT_PATH = "userDictionary"; - - private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; - - private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option - - private UserDictionary userDictionary; - - private Mode mode; - - private boolean discardPunctuation; - - @Override - public void inform(ResourceLoader loader) { - mode = getMode(args); - String userDictionaryPath = args.get(USER_DICT_PATH); - try { - if (userDictionaryPath != null) { - InputStream stream = loader.openResource(userDictionaryPath); - String encoding = args.get(USER_DICT_ENCODING); - if (encoding == null) { - encoding = IOUtils.UTF_8; - } - CharsetDecoder decoder = Charset.forName(encoding).newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - Reader reader = new InputStreamReader(stream, decoder); - userDictionary = new UserDictionary(reader); - } else { - userDictionary = null; - } - } catch (Exception e) { - throw new InitializationException("Exception thrown while loading dictionary", e); - } - discardPunctuation = getBoolean(DISCARD_PUNCTUATION, true); - } - - @Override - public Tokenizer create(Reader input) { - return new JapaneseTokenizer(input, userDictionary, discardPunctuation, mode); - } - - private Mode getMode(Map args) { - String mode = args.get(MODE); - if (mode != null) { - return Mode.valueOf(mode.toUpperCase(Locale.ROOT)); - } else { - return JapaneseTokenizer.DEFAULT_MODE; - } - } -} Index: solr/core/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (revision 1365483) +++ 
solr/core/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java (working copy) @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.util.*; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.KeepWordFilter; - -import java.util.Map; -import java.util.Set; -import java.io.IOException; - -/** - * Factory for {@link KeepWordFilter}. - *
- * <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class KeepWordFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public void inform(ResourceLoader loader) { - String wordFiles = args.get("words"); - ignoreCase = getBoolean("ignoreCase", false); - enablePositionIncrements = getBoolean("enablePositionIncrements",false); - - if (wordFiles != null) { - try { - words = getWordSet(loader, wordFiles, ignoreCase); - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading words", e); - } - } - } - - private CharArraySet words; - private boolean ignoreCase; - private boolean enablePositionIncrements; - - /** - * Set the keep word list. - * NOTE: if ignoreCase==true, the words are expected to be lowercase - */ - public void setWords(Set words) { - this.words = new CharArraySet(luceneMatchVersion, words, ignoreCase); - } - - public void setIgnoreCase(boolean ignoreCase) { - if (words != null && this.ignoreCase != ignoreCase) { - words = new CharArraySet(luceneMatchVersion, words, ignoreCase); - } - this.ignoreCase = ignoreCase; - } - - public boolean isEnablePositionIncrements() { - return enablePositionIncrements; - } - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public CharArraySet getWords() { - return words; - } - - public KeepWordFilter create(TokenStream input) { - return new KeepWordFilter(enablePositionIncrements, input, words); - } -} Index: solr/core/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java (working copy) @@ -1,61 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.IOException; - -import 
org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; -import org.apache.lucene.analysis.util.*; -import org.apache.lucene.analysis.TokenStream; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Factory for {@link KeywordMarkerFilter}. - *
- * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" ignoreCase="false"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class KeywordMarkerFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - public static final String PROTECTED_TOKENS = "protected"; - private CharArraySet protectedWords; - private boolean ignoreCase; - - public void inform(ResourceLoader loader) { - String wordFiles = args.get(PROTECTED_TOKENS); - ignoreCase = getBoolean("ignoreCase", false); - if (wordFiles != null) { - try { - protectedWords = getWordSet(loader, wordFiles, ignoreCase); - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading protected words", e); - } - } - } - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public TokenStream create(TokenStream input) { - return protectedWords == null ? input : new KeywordMarkerFilter(input, protectedWords); - } -} Index: solr/core/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java (working copy) @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; - -/** - * Factory for {@link KeywordTokenizer}. - *
- * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.KeywordTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class KeywordTokenizerFactory extends TokenizerFactory { - public KeywordTokenizer create(Reader input) { - return new KeywordTokenizer(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/KStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/KStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/KStemFilterFactory.java (working copy) @@ -1,33 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.en.KStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link KStemFilter} - */ -public class KStemFilterFactory extends TokenFilterFactory { - - public TokenFilter create(TokenStream input) { - return new KStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LatvianStemFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.lv.LatvianStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link LatvianStemFilter}. - *
- * <fieldType name="text_lvstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.LatvianStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class LatvianStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new LatvianStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java (working copy) @@ -20,6 +20,7 @@ import java.io.Reader; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; import org.apache.lucene.analysis.util.CharFilterFactory; /** Index: solr/core/src/java/org/apache/solr/analysis/LengthFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LengthFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LengthFilterFactory.java (working copy) @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.LengthFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; - -/** - * Factory for {@link LengthFilter}. - *
- * <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class LengthFilterFactory extends TokenFilterFactory { - int min,max; - boolean enablePositionIncrements; - public static final String MIN_KEY = "min"; - public static final String MAX_KEY = "max"; - - @Override - public void init(Map args) { - super.init(args); - min=Integer.parseInt(args.get(MIN_KEY)); - max=Integer.parseInt(args.get(MAX_KEY)); - enablePositionIncrements = getBoolean("enablePositionIncrements",false); - } - - public LengthFilter create(TokenStream input) { - return new LengthFilter(enablePositionIncrements, input,min,max); - } -} Index: solr/core/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java (working copy) @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.core.LetterTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link LetterTokenizer}. - *
- * <fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.LetterTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class LetterTokenizerFactory extends TokenizerFactory { - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public LetterTokenizer create(Reader input) { - return new LetterTokenizer(luceneMatchVersion, input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java (working copy) @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link LimitTokenCountFilter}. - *
- * <fieldType name="text_lngthcnt" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class LimitTokenCountFilterFactory extends TokenFilterFactory { - - int maxTokenCount; - - @Override - public void init(Map args) { - super.init( args ); - maxTokenCount = Integer.parseInt( args.get( "maxTokenCount" ) ); - } - - @Override - public TokenStream create(TokenStream input) { - return new LimitTokenCountFilter( input, maxTokenCount ); - } - -} Index: solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java (working copy) @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link LowerCaseFilter}. - *
- * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class LowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public LowerCaseFilter create(TokenStream input) { - return new LowerCaseFilter(luceneMatchVersion,input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java (working copy) @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.core.LowerCaseTokenizer; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link LowerCaseTokenizer}. - *
- * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.LowerCaseTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class LowerCaseTokenizerFactory extends TokenizerFactory implements MultiTermAwareComponent { - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public LowerCaseTokenizer create(Reader input) { - return new LowerCaseTokenizer(luceneMatchVersion,input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - LowerCaseFilterFactory filt = new LowerCaseFilterFactory(); - filt.setLuceneMatchVersion(luceneMatchVersion); - filt.init(args); - return filt; - } -} Index: solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java (working copy) @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import java.io.File; -import java.io.IOException; -import java.io.Reader; -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.charfilter.MappingCharFilter; -import org.apache.lucene.analysis.charfilter.NormalizeCharMap; -import org.apache.lucene.analysis.util.*; -import org.apache.solr.common.util.StrUtils; - -/** - * Factory for {@link MappingCharFilter}. - *
- * <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - * - * @since Solr 1.4 - * - */ -public class MappingCharFilterFactory extends CharFilterFactory implements - ResourceLoaderAware, MultiTermAwareComponent { - - protected NormalizeCharMap normMap; - private String mapping; - - public void inform(ResourceLoader loader) { - mapping = args.get( "mapping" ); - - if( mapping != null ){ - List wlist = null; - try{ - File mappingFile = new File( mapping ); - if( mappingFile.exists() ){ - wlist = loader.getLines( mapping ); - } - else{ - List files = StrUtils.splitFileNames( mapping ); - wlist = new ArrayList(); - for( String file : files ){ - List lines = loader.getLines( file.trim() ); - wlist.addAll( lines ); - } - } - } - catch( IOException e ){ - throw new InitializationException("IOException thrown while loading mappings", e); - } - final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); - parseRules( wlist, builder ); - normMap = builder.build(); - } - } - - public CharFilter create(Reader input) { - return new MappingCharFilter(normMap,input); - } - - // "source" => "target" - static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" ); - - protected void parseRules( List rules, NormalizeCharMap.Builder builder ){ - for( String rule : rules ){ - Matcher m = p.matcher( rule ); - if( !m.find() ) - throw new InitializationException("Invalid Mapping Rule : [" + rule + "], file = " + mapping); - builder.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) ); - } - } - - char[] out = new char[256]; - - protected String parseString( String s ){ - int readPos = 0; - int len = s.length(); - int writePos = 0; - while( readPos < len ){ - char c = s.charAt( readPos++ ); - if( c == '\\' ){ - if( readPos >= len ) - throw new InitializationException("Invalid escaped char in [" + s + "]"); - c = s.charAt( readPos++ ); - switch( c ) { - case '\\' : c = '\\'; break; - case '"' : c = '"'; break; - case 'n' : c = '\n'; break; - case 't' : c = '\t'; break; - case 'r' : c = '\r'; break; - case 
'b' : c = '\b'; break; - case 'f' : c = '\f'; break; - case 'u' : - if( readPos + 3 >= len ) - throw new InitializationException("Invalid escaped char in [" + s + "]"); - c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); - readPos += 4; - break; - } - } - out[writePos++] = c; - } - return new String( out, 0, writePos ); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/NGramFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/NGramFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/NGramFilterFactory.java (working copy) @@ -1,57 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Map; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ngram.NGramTokenFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link NGramTokenFilter}. - *
- * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class NGramFilterFactory extends TokenFilterFactory { - private int maxGramSize = 0; - - private int minGramSize = 0; - - /** Initialize the n-gram min and max sizes and the side from which one should start tokenizing. */ - @Override - public void init(Map args) { - super.init(args); - String maxArg = args.get("maxGramSize"); - maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) - : NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); - - String minArg = args.get("minGramSize"); - minGramSize = (minArg != null ? Integer.parseInt(minArg) - : NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); - } - - public NGramTokenFilter create(TokenStream input) { - return new NGramTokenFilter(input, minGramSize, maxGramSize); - } -} Index: solr/core/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java (working copy) @@ -1,56 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ngram.NGramTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link NGramTokenizer}. - *
- * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class NGramTokenizerFactory extends TokenizerFactory { - private int maxGramSize = 0; - private int minGramSize = 0; - - /** Initializes the n-gram min and max sizes and the side from which one should start tokenizing. */ - @Override - public void init(Map args) { - super.init(args); - String maxArg = args.get("maxGramSize"); - maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); - - String minArg = args.get("minGramSize"); - minGramSize = (minArg != null ? Integer.parseInt(minArg) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE); - } - - /** Creates the {@link TokenStream} of n-grams from the given {@link Reader}. */ - public NGramTokenizer create(Reader input) { - return new NGramTokenizer(input, minGramSize, maxGramSize); - } -} Index: solr/core/src/java/org/apache/solr/analysis/NorwegianLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/NorwegianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/NorwegianLightStemFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.no.NorwegianLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link NorwegianLightStemFilter}. - *
- * <fieldType name="text_svlgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.NorwegianLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class NorwegianLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new NorwegianLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/NorwegianMinimalStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/NorwegianMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/NorwegianMinimalStemFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.no.NorwegianMinimalStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link NorwegianMinimalStemFilter}. - *
- * <fieldType name="text_svlgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.NorwegianMinimalStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class NorwegianMinimalStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new NorwegianMinimalStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java (working copy) @@ -1,51 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; -import java.util.Map; - -/** - * Factory for {@link NumericPayloadTokenFilter}. - *
- * <fieldType name="text_numpayload" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.NumericPayloadTokenFilterFactory" payload="24" typeMatch="word"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class NumericPayloadTokenFilterFactory extends TokenFilterFactory { - private float payload; - private String typeMatch; - @Override - public void init(Map args) { - super.init(args); - payload = Float.parseFloat(args.get("payload")); - typeMatch = args.get("typeMatch"); - } - public NumericPayloadTokenFilter create(TokenStream input) { - return new NumericPayloadTokenFilter(input,payload,typeMatch); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PathHierarchyTokenizerFactory.java (working copy) @@ -1,98 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.util.Map; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.path.PathHierarchyTokenizer; -import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenizerFactory; - - -/** - * Factory for {@link PathHierarchyTokenizer}. - *
- * <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="\" replace="/"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PathHierarchyTokenizerFactory extends TokenizerFactory { - - private char delimiter; - private char replacement; - private boolean reverse = false; - private int skip = PathHierarchyTokenizer.DEFAULT_SKIP; - - /** - * Require a configured pattern - */ - @Override - public void init(Map args){ - super.init( args ); - - String v = args.get( "delimiter" ); - if( v != null ){ - if( v.length() != 1 ){ - throw new InitializationException("delimiter should be a char. \"" + v + "\" is invalid"); - } - else{ - delimiter = v.charAt(0); - } - } - else{ - delimiter = PathHierarchyTokenizer.DEFAULT_DELIMITER; - } - - v = args.get( "replace" ); - if( v != null ){ - if( v.length() != 1 ){ - throw new InitializationException("replace should be a char. \"" + v + "\" is invalid"); - } - else{ - replacement = v.charAt(0); - } - } - else{ - replacement = delimiter; - } - - v = args.get( "reverse" ); - if( v != null ){ - reverse = "true".equals( v ); - } - - v = args.get( "skip" ); - if( v != null ){ - skip = Integer.parseInt( v ); - } - } - - public Tokenizer create(Reader input) { - if( reverse ) { - return new ReversePathHierarchyTokenizer(input, delimiter, replacement, skip); - } - return new PathHierarchyTokenizer(input, delimiter, replacement, skip); - } -} - - Index: solr/core/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java (working copy) @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.Reader; -import java.util.Map; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter; -import org.apache.lucene.analysis.util.CharFilterFactory; - -/** - * Factory for {@link PatternReplaceCharFilter}. - *
- * <fieldType name="text_ptnreplace" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <charFilter class="solr.PatternReplaceCharFilterFactory" 
- *                    pattern="([^a-z])" replacement=""/>
- *     <tokenizer class="solr.KeywordTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - * - * @since Solr 3.1 - */ -public class PatternReplaceCharFilterFactory extends CharFilterFactory { - - private Pattern p; - private String replacement; - - @Override - public void init(Map args) { - super.init( args ); - p = getPattern("pattern"); - replacement = args.get( "replacement" ); - if( replacement == null ) - replacement = ""; - // TODO: throw exception if you set maxBlockChars or blockDelimiters ? - } - - public CharFilter create(Reader input) { - return new PatternReplaceCharFilter( p, replacement, input ); - } -} Index: solr/core/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java (working copy) @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.pattern.PatternReplaceFilter; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -/** - * Factory for {@link PatternReplaceFilter}. - *
- * <fieldType name="text_ptnreplace" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.KeywordTokenizerFactory"/>
- *     <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement=""
- *             replace="all"/>
- *   </analyzer>
- * </fieldType>
- * - * @see PatternReplaceFilter - */ -public class PatternReplaceFilterFactory extends TokenFilterFactory { - Pattern p; - String replacement; - boolean all = true; - - @Override - public void init(Map args) { - super.init(args); - p = getPattern("pattern"); - replacement = args.get("replacement"); - - String r = args.get("replace"); - if (null != r) { - if (r.equals("all")) { - all = true; - } else { - if (r.equals("first")) { - all = false; - } else { - throw new InitializationException - ("Configuration Error: 'replace' must be 'first' or 'all' in " - + this.getClass().getName()); - } - } - } - - } - public PatternReplaceFilter create(TokenStream input) { - return new PatternReplaceFilter(input, p, replacement, all); - } -} Index: solr/core/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java (working copy) @@ -1,107 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import java.io.IOException; -import java.io.Reader; -import java.util.Map; -import java.util.regex.Pattern; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.pattern.PatternTokenizer; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenizerFactory; - - -/** - * Factory for {@link PatternTokenizer}. - * This tokenizer uses regex pattern matching to construct distinct tokens - * for the input stream. It takes two arguments: "pattern" and "group". - *

- *

    - *
  • "pattern" is the regular expression.
  • - *
  • "group" says which group to extract into tokens.
  • - *
- *

- * group=-1 (the default) is equivalent to "split". In this case, the tokens will - * be equivalent to the output from (without empty tokens): - * {@link String#split(java.lang.String)} - *

- *

- * Using group >= 0 selects the matching group as the token. For example, if you have:
- *

- *  pattern = \'([^\']+)\'
- *  group = 0
- *  input = aaa 'bbb' 'ccc'
- *
- * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input - * but using group=1, the output would be: bbb and ccc (no ' marks) - *

- *

NOTE: This Tokenizer does not output tokens that are of zero length.

- * - *
- * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.PatternTokenizerFactory" pattern="\'([^\']+)\'" group="1"/>
- *   </analyzer>
- * </fieldType>
- * - * @see PatternTokenizer - * @since solr1.2 - * - */ -public class PatternTokenizerFactory extends TokenizerFactory -{ - public static final String PATTERN = "pattern"; - public static final String GROUP = "group"; - - protected Pattern pattern; - protected int group; - - /** - * Require a configured pattern - */ - @Override - public void init(Map args) - { - super.init(args); - pattern = getPattern( PATTERN ); - - group = -1; // use 'split' - String g = args.get( GROUP ); - if( g != null ) { - try { - group = Integer.parseInt( g ); - } - catch( Exception ex ) { - throw new InitializationException("invalid group argument: " + g); - } - } - } - - /** - * Split the input using configured pattern - */ - public Tokenizer create(final Reader in) { - try { - return new PatternTokenizer(in, pattern, group); - } catch( IOException ex ) { - throw new InitializationException("IOException thrown creating PatternTokenizer instance", ex); - } - } -} Index: solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; - -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.fa.PersianCharFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.CharFilterFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; - -/** - * Factory for {@link PersianCharFilter}. - *
- * <fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <charFilter class="solr.PersianCharFilterFactory"/>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PersianCharFilterFactory extends CharFilterFactory implements MultiTermAwareComponent { - - @Override - public CharFilter create(Reader input) { - return new PersianCharFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java (working copy) @@ -1,50 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.fa.PersianNormalizationFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link PersianNormalizationFilter}. - *
- * <fieldType name="text_fanormal" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <charFilter class="solr.PersianCharFilterFactory"/>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.PersianNormalizationFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PersianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - public PersianNormalizationFilter create(TokenStream input) { - return new PersianNormalizationFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} - Index: solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PhoneticFilterFactory.java (working copy) @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import java.lang.reflect.Method; -import java.lang.reflect.InvocationTargetException; -import java.util.HashMap; -import java.util.Locale; -import java.util.Map; - -import org.apache.commons.codec.Encoder; -import org.apache.commons.codec.language.*; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.phonetic.PhoneticFilter; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link PhoneticFilter}. - * - * Create tokens based on phonetic encoders from Apache Commons Codec. - *

- * This takes one required argument, "encoder", and the rest are optional: - *

- *
encoder
required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0), - * or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by - * itself if it already contains a '.' or otherwise as in the same package as these others. - *
inject
(default=true) add tokens to the stream with the offset=0 - *
maxCodeLength
The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't - * support this then specifying this is an error. - *
- * - *
- * <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.PhoneticFilterFactory" encoder="DoubleMetaphone" inject="true"/>
- *   </analyzer>
- * </fieldType>
- * - * @see PhoneticFilter - */ -public class PhoneticFilterFactory extends TokenFilterFactory -{ - public static final String ENCODER = "encoder"; - public static final String INJECT = "inject"; // boolean - public static final String MAX_CODE_LENGTH = "maxCodeLength"; - private static final String PACKAGE_CONTAINING_ENCODERS = "org.apache.commons.codec.language."; - - //Effectively constants; uppercase keys - private static final Map> registry = new HashMap>(6); - - static { - registry.put("DoubleMetaphone".toUpperCase(Locale.ROOT), DoubleMetaphone.class); - registry.put("Metaphone".toUpperCase(Locale.ROOT), Metaphone.class); - registry.put("Soundex".toUpperCase(Locale.ROOT), Soundex.class); - registry.put("RefinedSoundex".toUpperCase(Locale.ROOT), RefinedSoundex.class); - registry.put("Caverphone".toUpperCase(Locale.ROOT), Caverphone2.class); - registry.put("ColognePhonetic".toUpperCase(Locale.ROOT), ColognePhonetic.class); - } - - protected boolean inject = true; - protected String name = null; - protected Class clazz = null; - protected Method setMaxCodeLenMethod = null; - protected Integer maxCodeLength = null; - - @Override - public void init(Map args) { - super.init( args ); - - inject = getBoolean(INJECT, true); - - String name = args.get( ENCODER ); - if( name == null ) { - throw new InitializationException("Missing required parameter: " + ENCODER - + " [" + registry.keySet() + "]"); - } - clazz = registry.get(name.toUpperCase(Locale.ROOT)); - if( clazz == null ) { - clazz = resolveEncoder(name); - } - - String v = args.get(MAX_CODE_LENGTH); - if (v != null) { - maxCodeLength = Integer.valueOf(v); - try { - setMaxCodeLenMethod = clazz.getMethod("setMaxCodeLen", int.class); - } catch (Exception e) { - throw new InitializationException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e); - } - } - - getEncoder();//trigger initialization for potential problems to be thrown now - } - - private Class resolveEncoder(String name) { - 
String lookupName = name; - if (name.indexOf('.') == -1) { - lookupName = PACKAGE_CONTAINING_ENCODERS + name; - } - try { - return Class.forName(lookupName).asSubclass(Encoder.class); - } catch (ClassNotFoundException cnfe) { - throw new InitializationException("Unknown encoder: " + name + " must be full class name or one of " + registry.keySet(), cnfe); - } catch (ClassCastException e) { - throw new InitializationException("Not an encoder: " + name + " must be full class name or one of " + registry.keySet(), e); - } - } - - /** Must be thread-safe. */ - protected Encoder getEncoder() { - // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate - // every time. A simple benchmark showed this as negligible. - try { - Encoder encoder = clazz.newInstance(); - // Try to set the maxCodeLength - if(maxCodeLength != null && setMaxCodeLenMethod != null) { - setMaxCodeLenMethod.invoke(encoder, maxCodeLength); - } - return encoder; - } catch (Exception e) { - final Throwable t = (e instanceof InvocationTargetException) ? e.getCause() : e; - throw new InitializationException("Error initializing encoder: " + name + " / " + clazz, t); - } - } - - public PhoneticFilter create(TokenStream input) { - return new PhoneticFilter(input, getEncoder(), inject); - } - -} Index: solr/core/src/java/org/apache/solr/analysis/PorterStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PorterStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PorterStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link PorterStemFilter}. - *
- * <fieldType name="text_porterstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.PorterStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PorterStemFilterFactory extends TokenFilterFactory { - public PorterStemFilter create(TokenStream input) { - return new PorterStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/PortugueseLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PortugueseLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PortugueseLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.pt.PortugueseLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link PortugueseLightStemFilter}. - *
- * <fieldType name="text_ptlgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.PortugueseLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PortugueseLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new PortugueseLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/PortugueseMinimalStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PortugueseMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PortugueseMinimalStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.pt.PortugueseMinimalStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link PortugueseMinimalStemFilter}. - *
- * <fieldType name="text_ptminstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.PortugueseMinimalStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PortugueseMinimalStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new PortugueseMinimalStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PortugueseStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.pt.PortugueseStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link PortugueseStemFilter}. - *
- * <fieldType name="text_ptstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.PortugueseStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class PortugueseStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new PortugueseStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/PositionFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/PositionFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/PositionFilterFactory.java (working copy) @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.position.PositionFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; - -/** - * Factory for {@link PositionFilter}. - * Set the positionIncrement of all tokens to the "positionIncrement", except the first return token which retains its - * original positionIncrement value. The default positionIncrement value is zero. - *
- * <fieldType name="text_position" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.PositionFilterFactory" positionIncrement="0"/>
- *   </analyzer>
- * </fieldType>
- * - * - * @see org.apache.lucene.analysis.position.PositionFilter - * @since solr 1.4 - */ -public class PositionFilterFactory extends TokenFilterFactory { - private int positionIncrement; - - @Override - public void init(Map args) { - super.init(args); - positionIncrement = getInt("positionIncrement", 0); - } - - public PositionFilter create(TokenStream input) { - return new PositionFilter(input, positionIncrement); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java (working copy) @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link RemoveDuplicatesTokenFilter}. - *
- * <fieldType name="text_rmdup" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class RemoveDuplicatesTokenFilterFactory extends TokenFilterFactory { - public RemoveDuplicatesTokenFilter create(TokenStream input) { - return new RemoveDuplicatesTokenFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/ReverseStringFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ReverseStringFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ReverseStringFilterFactory.java (working copy) @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.reverse.ReverseStringFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link ReverseStringFilter}. - *
- * <fieldType name="text_rvsstr" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.ReverseStringFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - * - * @since solr 1.4 - */ -public class ReverseStringFilterFactory extends TokenFilterFactory { - public ReverseStringFilter create(TokenStream in) { - assureMatchVersion(); - return new ReverseStringFilter(luceneMatchVersion,in); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/RussianLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/RussianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/RussianLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.ru.RussianLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link RussianLightStemFilter}. - *
- * <fieldType name="text_rulgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.RussianLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class RussianLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new RussianLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/ShingleFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ShingleFilterFactory.java (working copy) @@ -1,82 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.shingle.ShingleFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.util.Map; - -/** - * Factory for {@link ShingleFilter}. - *
- * <fieldType name="text_shingle" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
- *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ShingleFilterFactory extends TokenFilterFactory { - private int minShingleSize; - private int maxShingleSize; - private boolean outputUnigrams; - private boolean outputUnigramsIfNoShingles; - private String tokenSeparator; - - @Override - public void init(Map args) { - super.init(args); - maxShingleSize = getInt("maxShingleSize", - ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); - if (maxShingleSize < 2) { - throw new InitializationException("Invalid maxShingleSize (" + maxShingleSize - + ") - must be at least 2"); - } - minShingleSize = getInt("minShingleSize", - ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE); - if (minShingleSize < 2) { - throw new InitializationException("Invalid minShingleSize (" + minShingleSize - + ") - must be at least 2"); - } - if (minShingleSize > maxShingleSize) { - throw new InitializationException("Invalid minShingleSize (" + minShingleSize - + ") - must be no greater than maxShingleSize (" - + maxShingleSize + ")"); - } - outputUnigrams = getBoolean("outputUnigrams", true); - outputUnigramsIfNoShingles = getBoolean("outputUnigramsIfNoShingles", false); - tokenSeparator = args.containsKey("tokenSeparator") - ? args.get("tokenSeparator") - : ShingleFilter.TOKEN_SEPARATOR; - } - public ShingleFilter create(TokenStream input) { - ShingleFilter r = new ShingleFilter(input, minShingleSize, maxShingleSize); - r.setOutputUnigrams(outputUnigrams); - r.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles); - r.setTokenSeparator(tokenSeparator); - return r; - } -} - Index: solr/core/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java (working copy) @@ -1,90 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.analysis; - -import java.util.Map; -import java.io.IOException; - -import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.snowball.SnowballFilter; -import org.apache.lucene.analysis.util.*; -import org.tartarus.snowball.SnowballProgram; - -/** - * Factory for {@link SnowballFilter}, with configurable language - *

- * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection. - *

- * <fieldType name="text_snowballstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.SnowballPorterFilterFactory" protected="protectedkeyword.txt" language="English"/>
- *   </analyzer>
- * </fieldType>
- * - * - */ -public class SnowballPorterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - public static final String PROTECTED_TOKENS = "protected"; - - private String language = "English"; - private Class stemClass; - - - public void inform(ResourceLoader loader) { - String wordFiles = args.get(PROTECTED_TOKENS); - if (wordFiles != null) { - try { - protectedWords = getWordSet(loader, wordFiles, false); - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading protected words", e); - } - } - } - - private CharArraySet protectedWords = null; - - @Override - public void init(Map args) { - super.init(args); - final String cfgLanguage = args.get("language"); - if(cfgLanguage!=null) language = cfgLanguage; - - try { - stemClass = Class.forName("org.tartarus.snowball.ext." + language + "Stemmer"); - } catch (ClassNotFoundException e) { - throw new InitializationException("Can't find class for stemmer language " + language, e); - } - } - - public TokenFilter create(TokenStream input) { - SnowballProgram program; - try { - program = (SnowballProgram)stemClass.newInstance(); - } catch (Exception e) { - throw new InitializationException("Error instantiating stemmer for language " + language + "from class " + stemClass, e); - } - - if (protectedWords != null) - input = new KeywordMarkerFilter(input, protectedWords); - return new SnowballFilter(input, program); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.es.SpanishLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link SpanishLightStemFilter}. - *
- * <fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.SpanishLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class SpanishLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new SpanishLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/StandardFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/StandardFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/StandardFilterFactory.java (working copy) @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link StandardFilter}. - *
- * <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.StandardFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class StandardFilterFactory extends TokenFilterFactory { - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public StandardFilter create(TokenStream input) { - return new StandardFilter(luceneMatchVersion, input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java (working copy) @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.StandardTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link StandardTokenizer}. - *
- * <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/>
- *   </analyzer>
- * </fieldType>
- * - */ - -public class StandardTokenizerFactory extends TokenizerFactory { - - private int maxTokenLength; - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - maxTokenLength = getInt("maxTokenLength", - StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); - } - - public StandardTokenizer create(Reader input) { - StandardTokenizer tokenizer - = new StandardTokenizer(luceneMatchVersion, input); - tokenizer.setMaxTokenLength(maxTokenLength); - return tokenizer; - } -} Index: solr/core/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java (working copy) @@ -1,74 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.util.List; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter; -import org.apache.lucene.analysis.util.*; -import org.apache.solr.common.util.StrUtils; - -/** - * Factory for {@link StemmerOverrideFilter}. - *
- * <fieldType name="text_dicstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class StemmerOverrideFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private CharArrayMap dictionary = null; - private boolean ignoreCase; - - public void inform(ResourceLoader loader) { - String dictionaryFiles = args.get("dictionary"); - ignoreCase = getBoolean("ignoreCase", false); - if (dictionaryFiles != null) { - assureMatchVersion(); - List files = StrUtils.splitFileNames(dictionaryFiles); - try { - if (files.size() > 0) { - dictionary = new CharArrayMap(luceneMatchVersion, - files.size() * 10, ignoreCase); - for (String file : files) { - List list = loader.getLines(file.trim()); - for (String line : list) { - String[] mapping = line.split("\t", 2); - dictionary.put(mapping[0], mapping[1]); - } - } - } - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading dictionary", e); - } - } - } - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public TokenStream create(TokenStream input) { - return dictionary == null ? input : new StemmerOverrideFilter(luceneMatchVersion, input, dictionary); - } -} Index: solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/StopFilterFactory.java (working copy) @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.util.*; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.StopAnalyzer; -import org.apache.lucene.analysis.core.StopFilter; - -import java.util.Map; -import java.io.IOException; - -/** - * Factory for {@link StopFilter}. - *
- * <fieldType name="text_stop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.StopFilterFactory" ignoreCase="true"
- *             words="stopwords.txt" enablePositionIncrements="true"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class StopFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - @Override - public void inform(ResourceLoader loader) { - String stopWordFiles = args.get("words"); - ignoreCase = getBoolean("ignoreCase",false); - enablePositionIncrements = getBoolean("enablePositionIncrements",false); - - if (stopWordFiles != null) { - try { - if ("snowball".equalsIgnoreCase(args.get("format"))) { - stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase); - } else { - stopWords = getWordSet(loader, stopWordFiles, ignoreCase); - } - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading stopwords", e); - } - } else { - stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase); - } - } - - private CharArraySet stopWords; - private boolean ignoreCase; - private boolean enablePositionIncrements; - - public boolean isEnablePositionIncrements() { - return enablePositionIncrements; - } - - public boolean isIgnoreCase() { - return ignoreCase; - } - - public CharArraySet getStopWords() { - return stopWords; - } - - @Override - public TokenStream create(TokenStream input) { - StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords); - stopFilter.setEnablePositionIncrements(enablePositionIncrements); - return stopFilter; - } -} Index: solr/core/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.sv.SwedishLightStemFilter; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link SwedishLightStemFilter}. - *
- * <fieldType name="text_svlgtstem" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.LowerCaseFilterFactory"/>
- *     <filter class="solr.SwedishLightStemFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class SwedishLightStemFilterFactory extends TokenFilterFactory { - public TokenStream create(TokenStream input) { - return new SwedishLightStemFilter(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/SynonymFilterFactory.java (working copy) @@ -1,174 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.File; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.nio.charset.CodingErrorAction; -import java.text.ParseException; -import java.util.List; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.synonym.SynonymFilter; -import org.apache.lucene.analysis.synonym.SynonymMap; -import org.apache.lucene.analysis.synonym.SolrSynonymParser; -import org.apache.lucene.analysis.synonym.WordnetSynonymParser; -import org.apache.lucene.analysis.util.*; -import org.apache.lucene.util.Version; -import org.apache.solr.common.util.StrUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Factory for {@link SynonymFilter}. - *
- * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
- *             format="solr" ignoreCase="false" expand="true" 
- *             tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- */ -public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - public static final Logger log = LoggerFactory.getLogger(SynonymFilterFactory.class); - - private SynonymMap map; - private boolean ignoreCase; - - @Override - public TokenStream create(TokenStream input) { - // if the fst is null, it means there's actually no synonyms... just return the original stream - // as there is nothing to do here. - return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase); - } - - @Override - public void inform(ResourceLoader loader) { - final boolean ignoreCase = getBoolean("ignoreCase", false); - this.ignoreCase = ignoreCase; - - String tf = args.get("tokenizerFactory"); - - final TokenizerFactory factory = tf == null ? null : loadTokenizerFactory(loader, tf); - - Analyzer analyzer = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader); - TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer; - return new TokenStreamComponents(tokenizer, stream); - } - }; - - String format = args.get("format"); - try { - if (format == null || format.equals("solr")) { - // TODO: expose dedup as a parameter? - map = loadSolrSynonyms(loader, true, analyzer); - } else if (format.equals("wordnet")) { - map = loadWordnetSynonyms(loader, true, analyzer); - } else { - // TODO: somehow make this more pluggable - throw new InitializationException("Unrecognized synonyms format: " + format); - } - } catch (Exception e) { - throw new InitializationException("Exception thrown while loading synonyms", e); - } - - if (map.fst == null) { - log.warn("Synonyms loaded with " + args + " has empty rule set!"); - } - } - - /** - * Load synonyms from the solr format, "format=solr". 
- */ - private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { - final boolean expand = getBoolean("expand", true); - String synonyms = args.get("synonyms"); - if (synonyms == null) - throw new InitializationException("Missing required argument 'synonyms'."); - - CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - - SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer); - File synonymFile = new File(synonyms); - if (synonymFile.exists()) { - decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); - } else { - List files = StrUtils.splitFileNames(synonyms); - for (String file : files) { - decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(file), decoder)); - } - } - return parser.build(); - } - - /** - * Load synonyms from the wordnet format, "format=wordnet". 
- */ - private SynonymMap loadWordnetSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { - final boolean expand = getBoolean("expand", true); - String synonyms = args.get("synonyms"); - if (synonyms == null) - throw new InitializationException("Missing required argument 'synonyms'."); - - CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT); - - WordnetSynonymParser parser = new WordnetSynonymParser(dedup, expand, analyzer); - File synonymFile = new File(synonyms); - if (synonymFile.exists()) { - decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); - } else { - List files = StrUtils.splitFileNames(synonyms); - for (String file : files) { - decoder.reset(); - parser.add(new InputStreamReader(loader.openResource(file), decoder)); - } - } - return parser.build(); - } - - private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname){ - TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class); - tokFactory.setLuceneMatchVersion(luceneMatchVersion); - tokFactory.init(args); - if (tokFactory instanceof ResourceLoaderAware) { - ((ResourceLoaderAware) tokFactory).inform(loader); - } - return tokFactory; - } -} Index: solr/core/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java (working copy) @@ -1,43 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; -import org.apache.lucene.analysis.th.ThaiWordFilter; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link ThaiWordFilter}. - *
- * <fieldType name="text_thai" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.ThaiWordFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class ThaiWordFilterFactory extends TokenFilterFactory { - public ThaiWordFilter create(TokenStream input) { - assureMatchVersion(); - return new ThaiWordFilter(luceneMatchVersion, input); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java (working copy) @@ -1,42 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link TokenOffsetPayloadTokenFilter}. - *
- * <fieldType name="text_tokenoffset" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.TokenOffsetPayloadTokenFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class TokenOffsetPayloadTokenFilterFactory extends TokenFilterFactory { - public TokenOffsetPayloadTokenFilter create(TokenStream input) { - return new TokenOffsetPayloadTokenFilter(input); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/TrimFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TrimFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/TrimFilterFactory.java (working copy) @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.TrimFilter; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link TrimFilter}. - *
- * <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.NGramTokenizerFactory"/>
- *     <filter class="solr.TrimFilterFactory" updateOffsets="false"/>
- *   </analyzer>
- * </fieldType>
- * - * @see TrimFilter - */ -public class TrimFilterFactory extends TokenFilterFactory { - - protected boolean updateOffsets = false; - - @Override - public void init(Map args) { - super.init( args ); - - String v = args.get( "updateOffsets" ); - if( v != null ) { - try { - updateOffsets = Boolean.valueOf( v ); - } - catch( Exception ex ) { - throw new InitializationException("Error reading updateOffsets value. Must be true or false.", ex); - } - } - } - - public TrimFilter create(TokenStream input) { - return new TrimFilter(input, updateOffsets); - } -} Index: solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; -import org.apache.lucene.analysis.util.AbstractAnalysisFactory; -import org.apache.lucene.analysis.util.MultiTermAwareComponent; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link TurkishLowerCaseFilter}. - *
- * <fieldType name="text_trlwr" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.TurkishLowerCaseFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class TurkishLowerCaseFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { - public TokenStream create(TokenStream input) { - return new TurkishLowerCaseFilter(input); - } - - @Override - public AbstractAnalysisFactory getMultiTermComponent() { - return this; - } -} Index: solr/core/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java (working copy) @@ -1,42 +0,0 @@ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * Factory for {@link TypeAsPayloadTokenFilter}. - *
- * <fieldType name="text_typeaspayload" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.TypeAsPayloadTokenFilterFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class TypeAsPayloadTokenFilterFactory extends TokenFilterFactory { - public TypeAsPayloadTokenFilter create(TokenStream input) { - return new TypeAsPayloadTokenFilter(input); - } -} - Index: solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/TypeTokenFilterFactory.java (working copy) @@ -1,85 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.core.TypeTokenFilter; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.common.util.StrUtils; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -import java.io.IOException; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -/** - * Factory class for {@link TypeTokenFilter}. - *
- * <fieldType name="chars" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.StandardTokenizerFactory"/>
- *     <filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
- *                   enablePositionIncrements="true" useWhiteList="false"/>
- *   </analyzer>
- * </fieldType>
- */ -public class TypeTokenFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - - @Override - public void inform(ResourceLoader loader) { - String stopTypesFiles = args.get("types"); - enablePositionIncrements = getBoolean("enablePositionIncrements", false); - useWhitelist = getBoolean("useWhitelist", false); - if (stopTypesFiles != null) { - try { - List files = StrUtils.splitFileNames(stopTypesFiles); - if (files.size() > 0) { - stopTypes = new HashSet(); - for (String file : files) { - List typesLines = loader.getLines(file.trim()); - stopTypes.addAll(typesLines); - } - } - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading types", e); - } - } else { - throw new InitializationException("Missing required parameter: types."); - } - } - - private boolean useWhitelist; - private Set stopTypes; - private boolean enablePositionIncrements; - - public boolean isEnablePositionIncrements() { - return enablePositionIncrements; - } - - public Set getStopTypes() { - return stopTypes; - } - - @Override - public TokenStream create(TokenStream input) { - return new TypeTokenFilter(enablePositionIncrements, input, stopTypes, useWhitelist); - } -} Index: solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java (working copy) @@ -1,59 +0,0 @@ -package org.apache.solr.analysis; - - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link UAX29URLEmailTokenizer}. - *
- * <fieldType name="text_urlemail" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.UAX29URLEmailTokenizerFactory" maxTokenLength="255"/>
- *   </analyzer>
- * </fieldType>
- * - * - */ - -public class UAX29URLEmailTokenizerFactory extends TokenizerFactory { - - private int maxTokenLength; - - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - maxTokenLength = getInt("maxTokenLength", - StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); - } - - public UAX29URLEmailTokenizer create(Reader input) { - UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(luceneMatchVersion, input); - tokenizer.setMaxTokenLength(maxTokenLength); - return tokenizer; - } -} Index: solr/core/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java (working copy) @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.core.WhitespaceTokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; - -import java.io.Reader; -import java.util.Map; - -/** - * Factory for {@link WhitespaceTokenizer}. - *
- * <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class WhitespaceTokenizerFactory extends TokenizerFactory { - @Override - public void init(Map args) { - super.init(args); - assureMatchVersion(); - } - - public WhitespaceTokenizer create(Reader input) { - return new WhitespaceTokenizer(luceneMatchVersion,input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java (working copy) @@ -1,41 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; - -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.TokenizerFactory; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; - -/** - * Factory for {@link WikipediaTokenizer}. - *
- * <fieldType name="text_wiki" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WikipediaTokenizerFactory"/>
- *   </analyzer>
- * </fieldType>
- * - */ -public class WikipediaTokenizerFactory extends TokenizerFactory { - // TODO: add support for WikipediaTokenizer's advanced options. - public Tokenizer create(Reader input) { - return new WikipediaTokenizer(input); - } -} Index: solr/core/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java =================================================================== --- solr/core/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (working copy) @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; -import org.apache.lucene.analysis.util.*; - -import org.apache.solr.common.util.StrUtils; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.io.IOException; - -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*; - - -/** - * Factory for {@link WordDelimiterFilter}. - *
- * <fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100">
- *   <analyzer>
- *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
- *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
- *             catenateWords="0" catenateNumbers="0" catenateAll="0"
- *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
- *             types="wdfftypes.txt" />
- *   </analyzer>
- * </fieldType>
- * - */ -public class WordDelimiterFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - public static final String PROTECTED_TOKENS = "protected"; - public static final String TYPES = "types"; - - public void inform(ResourceLoader loader) { - String wordFiles = args.get(PROTECTED_TOKENS); - if (wordFiles != null) { - try { - protectedWords = getWordSet(loader, wordFiles, false); - } catch (IOException e) { - throw new InitializationException("IOException thrown while loading protected words", e); - } - } - String types = args.get(TYPES); - if (types != null) { - try { - List files = StrUtils.splitFileNames( types ); - List wlist = new ArrayList(); - for( String file : files ){ - List lines = loader.getLines( file.trim() ); - wlist.addAll( lines ); - } - typeTable = parseTypes(wlist); - } catch (IOException e) { - throw new InitializationException("IOException while loading types", e); - } - } - } - - private CharArraySet protectedWords = null; - private int flags; - byte[] typeTable = null; - - @Override - public void init(Map args) { - super.init(args); - if (getInt("generateWordParts", 1) != 0) { - flags |= GENERATE_WORD_PARTS; - } - if (getInt("generateNumberParts", 1) != 0) { - flags |= GENERATE_NUMBER_PARTS; - } - if (getInt("catenateWords", 0) != 0) { - flags |= CATENATE_WORDS; - } - if (getInt("catenateNumbers", 0) != 0) { - flags |= CATENATE_NUMBERS; - } - if (getInt("catenateAll", 0) != 0) { - flags |= CATENATE_ALL; - } - if (getInt("splitOnCaseChange", 1) != 0) { - flags |= SPLIT_ON_CASE_CHANGE; - } - if (getInt("splitOnNumerics", 1) != 0) { - flags |= SPLIT_ON_NUMERICS; - } - if (getInt("preserveOriginal", 0) != 0) { - flags |= PRESERVE_ORIGINAL; - } - if (getInt("stemEnglishPossessive", 1) != 0) { - flags |= STEM_ENGLISH_POSSESSIVE; - } - } - - public WordDelimiterFilter create(TokenStream input) { - return new WordDelimiterFilter(input, typeTable == null ? 
WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, - flags, protectedWords); - } - - // source => type - private static Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" ); - - // parses a list of MappingCharFilter style rules into a custom byte[] type table - private byte[] parseTypes(List rules) { - SortedMap typeMap = new TreeMap(); - for( String rule : rules ){ - Matcher m = typePattern.matcher(rule); - if( !m.find() ) - throw new InitializationException("Invalid Mapping Rule : [" + rule + "]"); - String lhs = parseString(m.group(1).trim()); - Byte rhs = parseType(m.group(2).trim()); - if (lhs.length() != 1) - throw new InitializationException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); - if (rhs == null) - throw new InitializationException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); - typeMap.put(lhs.charAt(0), rhs); - } - - // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance - byte types[] = new byte[Math.max(typeMap.lastKey()+1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; - for (int i = 0; i < types.length; i++) - types[i] = WordDelimiterIterator.getType(i); - for (Map.Entry mapping : typeMap.entrySet()) - types[mapping.getKey()] = mapping.getValue(); - return types; - } - - private Byte parseType(String s) { - if (s.equals("LOWER")) - return LOWER; - else if (s.equals("UPPER")) - return UPPER; - else if (s.equals("ALPHA")) - return ALPHA; - else if (s.equals("DIGIT")) - return DIGIT; - else if (s.equals("ALPHANUM")) - return ALPHANUM; - else if (s.equals("SUBWORD_DELIM")) - return SUBWORD_DELIM; - else - return null; - } - - char[] out = new char[256]; - - private String parseString(String s){ - int readPos = 0; - int len = s.length(); - int writePos = 0; - while( readPos < len ){ - char c = s.charAt( readPos++ ); - if( c == '\\' ){ - if( readPos >= len ) - throw new InitializationException("Invalid escaped char in [" + s + "]"); - c = 
s.charAt( readPos++ ); - switch( c ) { - case '\\' : c = '\\'; break; - case 'n' : c = '\n'; break; - case 't' : c = '\t'; break; - case 'r' : c = '\r'; break; - case 'b' : c = '\b'; break; - case 'f' : c = '\f'; break; - case 'u' : - if( readPos + 3 >= len ) - throw new InitializationException("Invalid escaped char in [" + s + "]"); - c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 ); - readPos += 4; - break; - } - } - out[writePos++] = c; - } - return new String( out, 0, writePos ); - } -} Index: solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java =================================================================== --- solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/core/SolrResourceLoader.java (working copy) @@ -29,11 +29,16 @@ import java.net.URLClassLoader; import java.util.*; import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.ResourceLoaderAware; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; +import org.apache.lucene.analysis.util.AnalysisSPILoader; +import org.apache.lucene.util.WeakIdentityMap; import org.apache.solr.common.ResourceLoader; import org.apache.solr.handler.admin.CoreAdminHandler; import org.apache.solr.handler.component.ShardHandlerFactory; @@ -366,8 +371,15 @@ /* * A static map of short class name to fully qualified class name */ - private static Map classNameCache = new ConcurrentHashMap(); + private static final Map classNameCache = new ConcurrentHashMap(); + // A static map of AnalysisSPILoaders, keyed by ClassLoader used (because it can change during Solr lifetime) and expected base class: + private static final WeakIdentityMap,AnalysisSPILoader>> 
expectedTypesSPILoaders = WeakIdentityMap.newConcurrentHashMap(); + + // Using this pattern, legacy analysis components from previous Solr versions are identified and delegated to SPI loader: + private static final Pattern legacyAnalysisPattern = + Pattern.compile("((\\Q"+base+".analysis.\\E)|(\\Q"+project+".\\E))([\\p{L}_$][\\p{L}\\p{N}_$]+?)(TokenFilter|Filter|Tokenizer|CharFilter)Factory"); + /** * This method loads a class either with it's FQN or a short-name (solr.class-simplename or class-simplename). * It tries to load the class with the name that is given first and if it fails, it tries all the known @@ -394,6 +406,32 @@ } } Class clazz = null; + + // first try legacy analysis patterns, now replaced by Lucene's Analysis package: + final Matcher m = legacyAnalysisPattern.matcher(cname); + if (m.matches()) { + log.trace("Trying to load class from analysis SPI"); + // retrieve the map of classLoader -> expectedType -> SPI from cache / regenerate cache + Map,AnalysisSPILoader> spiLoaders = expectedTypesSPILoaders.get(classLoader); + if (spiLoaders == null) { + spiLoaders = new IdentityHashMap,AnalysisSPILoader>(3); + spiLoaders.put(CharFilterFactory.class, CharFilterFactory.getSPILoader(classLoader)); + spiLoaders.put(TokenizerFactory.class, TokenizerFactory.getSPILoader(classLoader)); + spiLoaders.put(TokenFilterFactory.class, TokenFilterFactory.getSPILoader(classLoader)); + expectedTypesSPILoaders.put(classLoader, spiLoaders); + } + @SuppressWarnings("unchecked") final AnalysisSPILoader loader = + (AnalysisSPILoader) spiLoaders.get(expectedType); + if (loader != null) { + // it's a correct expected type for analysis! Let's go on! 
+ try { + return clazz = loader.lookupClass(m.group(4)); + } catch (IllegalArgumentException ex) { + // ok, we fall back to legacy loading + } + } + } + // first try cname == full name try { return Class.forName(cname, true, classLoader).asSubclass(expectedType); Index: solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java =================================================================== --- solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/schema/FieldTypePluginLoader.java (working copy) @@ -19,9 +19,9 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; import org.apache.lucene.analysis.util.*; import org.apache.lucene.util.Version; -import org.apache.solr.analysis.KeywordTokenizerFactory; import org.apache.solr.analysis.TokenizerChain; import org.apache.solr.common.SolrException; import org.apache.solr.util.DOMUtil; Index: solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java =================================================================== --- solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java (revision 1365483) +++ solr/core/src/java/org/apache/solr/search/ExtendedDismaxQParserPlugin.java (working copy) @@ -27,6 +27,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.StopFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.queries.function.BoostedQuery; import org.apache.lucene.queries.function.FunctionQuery; @@ -36,7 +37,6 @@ import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.*; -import org.apache.solr.analysis.StopFilterFactory; import org.apache.solr.analysis.TokenizerChain; import 
org.apache.solr.search.SolrQueryParser.MagicFieldName; import org.apache.solr.common.params.DisMaxParams; Index: solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/CommonGramsFilterFactoryTest.java (working copy) @@ -1,106 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; -import java.util.HashMap; - -/** - * Tests pretty much copied from StopFilterFactoryTest We use the test files - * used by the StopFilterFactoryTest TODO: consider creating separate test files - * so this won't break if stop filter test files change - **/ -public class CommonGramsFilterFactoryTest extends BaseTokenStreamTestCase { - - public void testInform() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - assertTrue("loader is null and it shouldn't be", loader != null); - CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); - Map args = new HashMap(); - args.put("words", "stop-1.txt"); - args.put("ignoreCase", "true"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - CharArraySet words = factory.getCommonWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 2, - words.size() == 2); - assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory - .isIgnoreCase() == true); - - factory = new CommonGramsFilterFactory(); - args.put("words", "stop-1.txt, stop-2.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - words = factory.getCommonWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 4, - words.size() == 4); - assertTrue(factory.isIgnoreCase() + " does not equal: " + true, 
factory - .isIgnoreCase() == true); - - factory = new CommonGramsFilterFactory(); - args.put("words", "stop-snowball.txt"); - args.put("format", "snowball"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - words = factory.getCommonWords(); - assertEquals(8, words.size()); - assertTrue(words.contains("he")); - assertTrue(words.contains("him")); - assertTrue(words.contains("his")); - assertTrue(words.contains("himself")); - assertTrue(words.contains("she")); - assertTrue(words.contains("her")); - assertTrue(words.contains("hers")); - assertTrue(words.contains("herself")); - } - - /** - * If no words are provided, then a set of english default stopwords is used. - */ - public void testDefaults() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - assertTrue("loader is null and it shouldn't be", loader != null); - CommonGramsFilterFactory factory = new CommonGramsFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - factory.inform(loader); - CharArraySet words = factory.getCommonWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue(words.contains("the")); - Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, - new String[] { "testing", "testing_the", "the", "the_factory", "factory" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/CommonGramsQueryFilterFactoryTest.java (working copy) @@ -1,105 +0,0 @@ -/* - * Licensed to the Apache Software 
Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; -import java.util.HashMap; - -/** - * Tests pretty much copied from StopFilterFactoryTest We use the test files - * used by the StopFilterFactoryTest TODO: consider creating separate test files - * so this won't break if stop filter test files change - **/ -public class CommonGramsQueryFilterFactoryTest extends BaseTokenStreamTestCase { - - public void testInform() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - assertTrue("loader is null and it shouldn't be", loader != null); - CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); - Map args = new HashMap(); - args.put("words", "stop-1.txt"); - args.put("ignoreCase", "true"); - 
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - CharArraySet words = factory.getCommonWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 2, - words.size() == 2); - assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory - .isIgnoreCase() == true); - - factory = new CommonGramsQueryFilterFactory(); - args.put("words", "stop-1.txt, stop-2.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - words = factory.getCommonWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 4, - words.size() == 4); - assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory - .isIgnoreCase() == true); - - factory = new CommonGramsQueryFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - args.put("words", "stop-snowball.txt"); - args.put("format", "snowball"); - factory.init(args); - factory.inform(loader); - words = factory.getCommonWords(); - assertEquals(8, words.size()); - assertTrue(words.contains("he")); - assertTrue(words.contains("him")); - assertTrue(words.contains("his")); - assertTrue(words.contains("himself")); - assertTrue(words.contains("she")); - assertTrue(words.contains("her")); - assertTrue(words.contains("hers")); - assertTrue(words.contains("herself")); - } - - /** - * If no words are provided, then a set of english default stopwords is used. 
- */ - public void testDefaults() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - assertTrue("loader is null and it shouldn't be", loader != null); - CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - factory.inform(loader); - CharArraySet words = factory.getCommonWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue(words.contains("the")); - Tokenizer tokenizer = new MockTokenizer(new StringReader("testing the factory"), MockTokenizer.WHITESPACE, false); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, - new String[] { "testing_the", "the_factory" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/DoubleMetaphoneFilterFactoryTest.java (working copy) @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -public class DoubleMetaphoneFilterFactoryTest extends BaseTokenStreamTestCase { - - public void testDefaults() throws Exception { - DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); - factory.init(new HashMap()); - TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false); - - TokenStream filteredStream = factory.create(inputStream); - assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); - assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" }); - } - - public void testSettingSizeAndInject() throws Exception { - DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); - Map parameters = new HashMap(); - parameters.put("inject", "false"); - parameters.put("maxCodeLength", "8"); - factory.init(parameters); - - TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false); - - TokenStream filteredStream = factory.create(inputStream); - assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); - assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" }); - } - - /** - * Ensure that reset() removes any state (buffered tokens) - */ - public void testReset() throws Exception { - DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(); - factory.init(new HashMap()); - TokenStream inputStream = new MockTokenizer(new 
StringReader("international"), MockTokenizer.WHITESPACE, false); - - TokenStream filteredStream = factory.create(inputStream); - CharTermAttribute termAtt = filteredStream.addAttribute(CharTermAttribute.class); - assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass()); - - filteredStream.reset(); - assertTrue(filteredStream.incrementToken()); - assertEquals(13, termAtt.length()); - assertEquals("international", termAtt.toString()); - filteredStream.reset(); - - // ensure there are no more tokens, such as ANTRNXNL - assertFalse(filteredStream.incrementToken()); - } -} Index: solr/core/src/test/org/apache/solr/analysis/LengthFilterTest.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/LengthFilterTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/LengthFilterTest.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -public class LengthFilterTest extends BaseTokenStreamTestCase { - - public void test() throws IOException { - LengthFilterFactory factory = new LengthFilterFactory(); - Map args = new HashMap(); - args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); - args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); - // default: args.put("enablePositionIncrements", "false"); - factory.init(args); - String test = "foo foobar super-duper-trooper"; - TokenStream stream = factory.create(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 1 }); - - factory = new LengthFilterFactory(); - args = new HashMap(); - args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4)); - args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10)); - args.put("enablePositionIncrements", "true"); - factory.init(args); - stream = factory.create(new MockTokenizer(new StringReader(test), MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "foobar" }, new int[] { 2 }); - } -} \ No newline at end of file Index: solr/core/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/SnowballPorterFilterFactoryTest.java (working copy) @@ -1,101 +0,0 @@ -package org.apache.solr.analysis; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with 
the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.common.util.StrUtils; -import org.apache.solr.core.SolrResourceLoader; -import org.tartarus.snowball.ext.EnglishStemmer; - -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.io.StringReader; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.ArrayList; - -public class SnowballPorterFilterFactoryTest extends BaseTokenStreamTestCase { - - public void test() throws IOException { - EnglishStemmer stemmer = new EnglishStemmer(); - String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"}; - String[] gold = new String[test.length]; - for (int i = 0; i < test.length; i++) { - stemmer.setCurrent(test[i]); - stemmer.stem(); - gold[i] = stemmer.getCurrent(); - } - - SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); - Map args = new HashMap(); - args.put("language", "English"); - - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(new LinesMockSolrResourceLoader(new ArrayList())); - Tokenizer tokenizer = new MockTokenizer( - new StringReader(StrUtils.join(Arrays.asList(test), ' ')), MockTokenizer.WHITESPACE, false); - TokenStream stream = 
factory.create(tokenizer); - assertTokenStreamContents(stream, gold); - } - - class LinesMockSolrResourceLoader implements ResourceLoader { - List lines; - - LinesMockSolrResourceLoader(List lines) { - this.lines = lines; - } - - public List getLines(String resource) throws IOException { - return lines; - } - - public T newInstance(String cname, Class expectedType, String... subpackages) { - return null; - } - - public InputStream openResource(String resource) throws IOException { - return null; - } - } - - /** - * Test the protected words mechanism of SnowballPorterFilterFactory - */ - public void testProtected() throws Exception { - SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - Map args = new HashMap(); - args.put("protected", "protwords.txt"); - args.put("language", "English"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - Reader reader = new StringReader("ridding of some stemming"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" }); - } -} - Index: solr/core/src/test/org/apache/solr/analysis/TestArabicFilters.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestArabicFilters.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestArabicFilters.java (working copy) @@ -1,83 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Arabic filter Factories are working. - */ -public class TestArabicFilters extends BaseTokenStreamTestCase { - - /** - * Test ArabicNormalizationFilterFactory - */ - public void testNormalizer() throws Exception { - Reader reader = new StringReader("الذين مَلكت أيمانكم"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory(); - filterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - filterFactory.init(args); - Tokenizer tokenizer = factory.create(reader); - TokenStream stream = filterFactory.create(tokenizer); - assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"}); - } - - /** - * Test ArabicStemFilterFactory - */ - public void testStemmer() throws Exception { - Reader reader = new StringReader("الذين مَلكت أيمانكم"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - 
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory(); - normFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory(); - Map args = Collections.emptyMap(); - factory.init(args); - normFactory.init(args); - Tokenizer tokenizer = factory.create(reader); - TokenStream stream = normFactory.create(tokenizer); - stream = stemFactory.create(stream); - assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"}); - } - - /** - * Test PersianCharFilterFactory - */ - public void testPersianCharFilter() throws Exception { - Reader reader = new StringReader("می‌خورد"); - PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory(); - StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory(); - tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - tokenizerFactory.init(args); - TokenStream stream = tokenizerFactory.create(charfilterFactory.create(reader)); - assertTokenStreamContents(stream, new String[] { "می", "خورد" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestBeiderMorseFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestBeiderMorseFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestBeiderMorseFilterFactory.java (working copy) @@ -1,70 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** Simple tests for {@link BeiderMorseFilterFactory} */ -public class TestBeiderMorseFilterFactory extends BaseTokenStreamTestCase { - public void testBasics() throws Exception { - BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(ts, - new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" }, - new int[] { 0, 0, 0, 0, 0, 0 }, - new int[] { 8, 8, 8, 8, 8, 8 }, - new int[] { 1, 0, 0, 0, 0, 0 }); - } - - public void testLanguageSet() throws Exception { - BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(); - Map args = new HashMap(); - args.put("languageSet", "polish"); - factory.init(args); - TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(ts, - new String[] { "vDmbYrk", "vDmbirk", "vambYrk", "vambirk", "vimbYrk", "vimbirk" }, - new int[] { 0, 0, 0, 0, 0, 0 }, - new int[] { 8, 8, 8, 8, 8, 8 }, - new int[] { 1, 0, 0, 0, 0, 0 }); - } - - public void testOptions() throws 
Exception { - BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(); - Map args = new HashMap(); - args.put("nameType", "ASHKENAZI"); - args.put("ruleType", "EXACT"); - factory.init(args); - TokenStream ts = factory.create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(ts, - new String[] { "vajnberk" }, - new int[] { 0 }, - new int[] { 8 }, - new int[] { 1 }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestBrazilianStemFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Brazilian stem filter factory is working. - */ -public class TestBrazilianStemFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually stems and normalizes text. - */ - public void testStemming() throws Exception { - Reader reader = new StringReader("Brasília"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory(); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "brasil" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestBulgarianStemFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Bulgarian stem filter factory is working. - */ -public class TestBulgarianStemFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually stems text. - */ - public void testStemming() throws Exception { - Reader reader = new StringReader("компютри"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - BulgarianStemFilterFactory factory = new BulgarianStemFilterFactory(); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "компютр" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestCapitalizationFilterFactory.java (working copy) @@ -1,224 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - - -/** - * - */ -public class TestCapitalizationFilterFactory extends BaseTokenStreamTestCase { - - public void testCapitalization() throws Exception - { - Map args = new HashMap(); - args.put( CapitalizationFilterFactory.KEEP, "and the it BIG" ); - args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); - - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init( args ); - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.WHITESPACE, false)), - new String[] { "Kitten" }); - - factory.forceFirstLetter = true; - - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("and"), MockTokenizer.WHITESPACE, false)), - new String[] { "And" }); - - //first is forced, but it's not a keep word, either - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("AnD"), MockTokenizer.WHITESPACE, false)), - new String[] { "And" }); - - factory.forceFirstLetter = false; - - //first is not forced, but it's not a keep word, either - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("AnD"), MockTokenizer.WHITESPACE, false)), 
- new String[] { "And" }); - - factory.forceFirstLetter = true; - - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("big"), MockTokenizer.WHITESPACE, false)), - new String[] { "Big" }); - - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("BIG"), MockTokenizer.WHITESPACE, false)), - new String[] { "BIG" }); - - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("Hello thEre my Name is Ryan"), MockTokenizer.KEYWORD, false)), - new String[] { "Hello there my name is ryan" }); - - // now each token - factory.onlyFirstWord = false; - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("Hello thEre my Name is Ryan"), MockTokenizer.WHITESPACE, false)), - new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }); - - // now only the long words - factory.minWordLength = 3; - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("Hello thEre my Name is Ryan"), MockTokenizer.WHITESPACE, false)), - new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }); - - // without prefix - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("McKinley"), MockTokenizer.WHITESPACE, false)), - new String[] { "Mckinley" }); - - // Now try some prefixes - factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - args.put( "okPrefix", "McK" ); // all words - factory.init( args ); - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("McKinley"), MockTokenizer.WHITESPACE, false)), - new String[] { "McKinley" }); - - // now try some stuff with numbers - factory.forceFirstLetter = false; - factory.onlyFirstWord = false; - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("1st 2nd third"), MockTokenizer.WHITESPACE, false)), - new String[] { "1st", "2nd", "Third" }); - - factory.forceFirstLetter = true; - 
assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("the The the"), MockTokenizer.KEYWORD, false)), - new String[] { "The The the" }); - } - - public void testKeepIgnoreCase() throws Exception { - Map args = new HashMap(); - args.put( CapitalizationFilterFactory.KEEP, "kitten" ); - args.put( CapitalizationFilterFactory.KEEP_IGNORE_CASE, "true" ); - args.put( CapitalizationFilterFactory.ONLY_FIRST_WORD, "true" ); - - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init( args ); - factory.forceFirstLetter = true; - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.KEYWORD, false)), - new String[] { "KiTTEN" }); - - factory.forceFirstLetter = false; - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.KEYWORD, false)), - new String[] { "kiTTEN" }); - - factory.keep = null; - assertTokenStreamContents(factory.create( - new MockTokenizer(new StringReader("kiTTEN"), MockTokenizer.KEYWORD, false)), - new String[] { "Kitten" }); - } - - /** - * Test CapitalizationFilterFactory's minWordLength option. - * - * This is very weird when combined with ONLY_FIRST_WORD!!! 
- */ - public void testMinWordLength() throws Exception { - Map args = new HashMap(); - args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true"); - args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer tokenizer = new MockTokenizer(new StringReader( - "helo testing"), MockTokenizer.WHITESPACE, false); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"helo", "Testing"}); - } - - /** - * Test CapitalizationFilterFactory's maxWordCount option with only words of 1 - * in each token (it should do nothing) - */ - public void testMaxWordCount() throws Exception { - Map args = new HashMap(); - args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer tokenizer = new MockTokenizer(new StringReader( - "one two three four"), MockTokenizer.WHITESPACE, false); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"}); - } - - /** - * Test CapitalizationFilterFactory's maxWordCount option when exceeded - */ - public void testMaxWordCount2() throws Exception { - Map args = new HashMap(); - args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer tokenizer = new MockTokenizer(new StringReader( - "one two three four"), MockTokenizer.KEYWORD, false); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"one two three four"}); - } - - /** - * Test CapitalizationFilterFactory's maxTokenLength option when exceeded - * - * This is weird, it is 
not really a max, but inclusive (look at 'is') - */ - public void testMaxTokenLength() throws Exception { - Map args = new HashMap(); - args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer tokenizer = new MockTokenizer(new StringReader( - "this is a test"), MockTokenizer.WHITESPACE, false); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"}); - } - - /** - * Test CapitalizationFilterFactory's forceFirstLetter option - */ - public void testForceFirstLetter() throws Exception { - Map args = new HashMap(); - args.put(CapitalizationFilterFactory.KEEP, "kitten"); - args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true"); - CapitalizationFilterFactory factory = new CapitalizationFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer tokenizer = new MockTokenizer(new StringReader("kitten"), MockTokenizer.WHITESPACE, false); - TokenStream ts = factory.create(tokenizer); - assertTokenStreamContents(ts, new String[] {"Kitten"}); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestCJKBigramFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestCJKBigramFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestCJKBigramFilterFactory.java (working copy) @@ -1,56 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardTokenizer; - -/** - * Simple tests to ensure the CJK bigram factory is working. - * @deprecated - */ -public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase { - public void testDefaults() throws Exception { - Reader reader = new StringReader("多くの学生が試験に落ちた。"); - CJKBigramFilterFactory factory = new CJKBigramFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader)); - assertTokenStreamContents(stream, - new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" }); - } - - public void testHanOnly() throws Exception { - Reader reader = new StringReader("多くの学生が試験に落ちた。"); - CJKBigramFilterFactory factory = new CJKBigramFilterFactory(); - Map args = new HashMap(); - args.put("hiragana", "false"); - factory.init(args); - TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader)); - assertTokenStreamContents(stream, - new String[] { "多", "く", "の", "学生", "が", "試験", "に", 
"落", "ち", "た" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestCJKWidthFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestCJKWidthFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestCJKWidthFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the CJKWidthFilterFactory is working - */ -public class TestCJKWidthFilterFactory extends BaseTokenStreamTestCase { - public void test() throws Exception { - Reader reader = new StringReader("Test 1234"); - CJKWidthFilterFactory factory = new CJKWidthFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "Test", "1234" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestCzechStemFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Czech stem filter factory is working. - */ -public class TestCzechStemFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually stems text. - */ - public void testStemming() throws Exception { - Reader reader = new StringReader("angličtí"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - CzechStemFilterFactory factory = new CzechStemFilterFactory(); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "anglick" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestDelimitedPayloadTokenFilterFactory.java (working copy) @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter; -import org.apache.lucene.analysis.payloads.FloatEncoder; -import org.apache.lucene.analysis.payloads.PayloadHelper; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -public class TestDelimitedPayloadTokenFilterFactory extends BaseTokenStreamTestCase { - - public void testEncoder() throws Exception { - Map args = new HashMap(); - args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, "float"); - DelimitedPayloadTokenFilterFactory factory = new DelimitedPayloadTokenFilterFactory(); - factory.init(args); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - factory.inform(loader); - - TokenStream input = new MockTokenizer(new StringReader("the|0.1 quick|0.1 red|0.1"), MockTokenizer.WHITESPACE, false); - DelimitedPayloadTokenFilter tf = factory.create(input); - tf.reset(); - while (tf.incrementToken()){ - PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class); - assertTrue("payAttr is null and it shouldn't be", payAttr != null); - byte[] payData = payAttr.getPayload().bytes; - assertTrue("payData is null and it shouldn't be", payData != null); - assertTrue("payData is null and it shouldn't be", payData != null); - float payFloat = PayloadHelper.decodeFloat(payData); - assertTrue(payFloat + " does not equal: " + 0.1f, payFloat == 0.1f); - } - } - - public void testDelim() throws Exception { - Map args = new HashMap(); - 
args.put(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR, FloatEncoder.class.getName()); - args.put(DelimitedPayloadTokenFilterFactory.DELIMITER_ATTR, "*"); - DelimitedPayloadTokenFilterFactory factory = new DelimitedPayloadTokenFilterFactory(); - factory.init(args); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - factory.inform(loader); - - TokenStream input = new MockTokenizer(new StringReader("the*0.1 quick*0.1 red*0.1"), MockTokenizer.WHITESPACE, false); - DelimitedPayloadTokenFilter tf = factory.create(input); - tf.reset(); - while (tf.incrementToken()){ - PayloadAttribute payAttr = tf.getAttribute(PayloadAttribute.class); - assertTrue("payAttr is null and it shouldn't be", payAttr != null); - byte[] payData = payAttr.getPayload().bytes; - assertTrue("payData is null and it shouldn't be", payData != null); - float payFloat = PayloadHelper.decodeFloat(payData); - assertTrue(payFloat + " does not equal: " + 0.1f, payFloat == 0.1f); - } - } -} - Index: solr/core/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestDictionaryCompoundWordTokenFilterFactory.java (working copy) @@ -1,54 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests to ensure the Dictionary compound filter factory is working. - */ -public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually decompounds text. 
- */ - public void testDecompounding() throws Exception { - Reader reader = new StringReader("I like to play softball"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - Map args = new HashMap(); - args.put("dictionary", "compoundDictionary.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, - new String[] { "I", "like", "to", "play", "softball", "soft", "ball" }); - } - -} Index: solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestElisionFilterFactory.java (working copy) @@ -1,88 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests to ensure the French elision filter factory is working. - */ -public class TestElisionFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually normalizes text. - */ - public void testElision() throws Exception { - Reader reader = new StringReader("l'avion"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - ElisionFilterFactory factory = new ElisionFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - Map args = new HashMap(); - args.put("articles", "frenchArticles.txt"); - factory.init(args); - factory.inform(loader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "avion" }); - } - - /** - * Test creating an elision filter without specifying any articles - */ - public void testDefaultArticles() throws Exception { - Reader reader = new StringReader("l'avion"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - ElisionFilterFactory factory = new ElisionFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - factory.inform(loader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "avion" }); - } - - /** - * Test setting ignoreCase=true - */ - public void 
testCaseInsensitive() throws Exception { - Reader reader = new StringReader("L'avion"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - ElisionFilterFactory factory = new ElisionFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - Map args = new HashMap(); - args.put("articles", "frenchArticles.txt"); - args.put("ignoreCase", "true"); - factory.init(args); - factory.inform(loader); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "avion" }); - } - -} Index: solr/core/src/test/org/apache/solr/analysis/TestEnglishMinimalStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestEnglishMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestEnglishMinimalStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the English minimal stem factory is working. - */ -public class TestEnglishMinimalStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("bricks"); - EnglishMinimalStemFilterFactory factory = new EnglishMinimalStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "brick" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestFinnishLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestFinnishLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestFinnishLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Finnish light stem factory is working. - */ -public class TestFinnishLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("aseistettujen"); - FinnishLightStemFilterFactory factory = new FinnishLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "aseistet" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestFrenchLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestFrenchLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestFrenchLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the French light stem factory is working. - */ -public class TestFrenchLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("administrativement"); - FrenchLightStemFilterFactory factory = new FrenchLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "administratif" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestFrenchMinimalStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestFrenchMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestFrenchMinimalStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the French minimal stem factory is working. - */ -public class TestFrenchMinimalStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("chevaux"); - FrenchMinimalStemFilterFactory factory = new FrenchMinimalStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "cheval" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGalicianMinimalStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Galician plural stem factory is working. - */ -public class TestGalicianMinimalStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("elefantes"); - GalicianMinimalStemFilterFactory factory = new GalicianMinimalStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "elefante" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGalicianStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Galician stem factory is working. - */ -public class TestGalicianStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("cariñosa"); - GalicianStemFilterFactory factory = new GalicianStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "cariñ" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGermanLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGermanLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGermanLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the German light stem factory is working. - */ -public class TestGermanLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("häuser"); - GermanLightStemFilterFactory factory = new GermanLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "haus" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGermanMinimalStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGermanMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGermanMinimalStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the German minimal stem factory is working. - */ -public class TestGermanMinimalStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("bilder"); - GermanMinimalStemFilterFactory factory = new GermanMinimalStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "bild" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGermanNormalizationFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the German normalization factory is working. - */ -public class TestGermanNormalizationFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("weißbier"); - GermanNormalizationFilterFactory factory = new GermanNormalizationFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "weissbier" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGermanStemFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the German stem filter factory is working. - */ -public class TestGermanStemFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually stems text. - */ - public void testStemming() throws Exception { - Reader reader = new StringReader("Tischen"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - GermanStemFilterFactory factory = new GermanStemFilterFactory(); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "tisch" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGreekLowerCaseFilterFactory.java (working copy) @@ -1,47 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Greek lowercase filter factory is working. - */ -public class TestGreekLowerCaseFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually lowercases (and a bit more) greek text. - */ - public void testNormalization() throws Exception { - Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestGreekStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestGreekStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestGreekStemFilterFactory.java (working copy) @@ -1,41 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.el.GreekLowerCaseFilter; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor 
license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Simple tests to ensure the Greek stem filter factory is working. - */ -public class TestGreekStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("άνθρωπος"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - TokenStream normalized = new GreekLowerCaseFilter(TEST_VERSION_CURRENT, tokenizer); - GreekStemFilterFactory factory = new GreekStemFilterFactory(); - TokenStream stream = factory.create(normalized); - assertTokenStreamContents(stream, new String[] { "ανθρωπ" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestHindiFilters.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestHindiFilters.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestHindiFilters.java (working copy) @@ -1,89 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Hindi filter Factories are working. - */ -public class TestHindiFilters extends BaseTokenStreamTestCase { - /** - * Test IndicNormalizationFilterFactory - */ - public void testIndicNormalizer() throws Exception { - Reader reader = new StringReader("ত্‍ अाैर"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory(); - filterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - filterFactory.init(args); - Tokenizer tokenizer = factory.create(reader); - TokenStream stream = filterFactory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "ৎ", "और" }); - } - - /** - * Test HindiNormalizationFilterFactory - */ - public void testHindiNormalizer() throws Exception { - Reader reader = new StringReader("क़िताब"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - 
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); - HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); - hindiFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - hindiFilterFactory.init(args); - Tokenizer tokenizer = factory.create(reader); - TokenStream stream = indicFilterFactory.create(tokenizer); - stream = hindiFilterFactory.create(stream); - assertTokenStreamContents(stream, new String[] {"किताब"}); - } - - /** - * Test HindiStemFilterFactory - */ - public void testStemmer() throws Exception { - Reader reader = new StringReader("किताबें"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); - HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); - HindiStemFilterFactory stemFactory = new HindiStemFilterFactory(); - stemFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - stemFactory.init(args); - Tokenizer tokenizer = factory.create(reader); - TokenStream stream = indicFilterFactory.create(tokenizer); - stream = hindiFilterFactory.create(stream); - stream = stemFactory.create(stream); - assertTokenStreamContents(stream, new String[] {"किताब"}); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java (working copy) @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.IOException; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.*; - -/** - * Simple tests to ensure this factory is working - */ -public class TestHTMLStripCharFilterFactory extends BaseTokenStreamTestCase { - - - public void testNothingChanged() throws IOException { - // 11111111112 - // 012345678901234567890 - final String text = "this is only a test."; - HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); - Map args = new HashMap(); - args.put("escapedTags", "a, Title"); - factory.init(args); - CharFilter cs = factory.create(new StringReader(text)); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "only", "a", "test." 
}, - new int[] { 0, 5, 8, 13, 15 }, - new int[] { 4, 7, 12, 14, 20 }); - } - - public void testNoEscapedTags() throws IOException { - // 11111111112222222222333333333344 - // 012345678901234567890123456789012345678901 - final String text = "this is only a test."; - HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); - Map args = new HashMap(); - factory.init(args); - CharFilter cs = factory.create(new StringReader(text)); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "only", "a", "test." }, - new int[] { 3, 12, 18, 27, 32 }, - new int[] { 11, 14, 26, 28, 41 }); - } - - public void testEscapedTags() throws IOException { - // 11111111112222222222333333333344 - // 012345678901234567890123456789012345678901 - final String text = "this is only a test."; - HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); - Map args = new HashMap(); - args.put("escapedTags", "U i"); - factory.init(args); - CharFilter cs = factory.create(new StringReader(text)); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "only", "a", "test." }, - new int[] { 0, 12, 18, 27, 29 }, - new int[] { 11, 14, 26, 28, 41 }); - } - - public void testSeparatorOnlyEscapedTags() throws IOException { - // 11111111112222222222333333333344 - // 012345678901234567890123456789012345678901 - final String text = "this is only a test."; - HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); - Map args = new HashMap(); - args.put("escapedTags", ",, , "); - factory.init(args); - CharFilter cs = factory.create(new StringReader(text)); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "only", "a", "test." 
}, - new int[] { 3, 12, 18, 27, 32 }, - new int[] { 11, 14, 26, 28, 41 }); - } - - public void testEmptyEscapedTags() throws IOException { - // 11111111112222222222333333333344 - // 012345678901234567890123456789012345678901 - final String text = "this is only a test."; - HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); - Map args = new HashMap(); - args.put("escapedTags", ""); - factory.init(args); - CharFilter cs = factory.create(new StringReader(text)); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "only", "a", "test." }, - new int[] { 3, 12, 18, 27, 32 }, - new int[] { 11, 14, 26, 28, 41 }); - } - - public void testSingleEscapedTag() throws IOException { - // 11111111112222222222333333333344 - // 012345678901234567890123456789012345678901 - final String text = "this is only a test."; - HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory(); - Map args = new HashMap(); - args.put("escapedTags", ", B\r\n\t"); - factory.init(args); - CharFilter cs = factory.create(new StringReader(text)); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "only", "a", "test." }, - new int[] { 3, 12, 15, 27, 32 }, - new int[] { 11, 14, 26, 28, 41 }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestHungarianLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestHungarianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestHungarianLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Hungarian light stem factory is working. - */ -public class TestHungarianLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("házakat"); - HungarianLightStemFilterFactory factory = new HungarianLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "haz" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestHunspellStemFilterFactory.java (working copy) @@ -1,48 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; -import org.apache.solr.schema.IndexSchema; - -/** - * Simple tests to ensure the Hunspell stemmer loads from factory - */ -public class TestHunspellStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - HunspellStemFilterFactory factory = new HunspellStemFilterFactory(); - Map args = new HashMap(); - args.put("dictionary", "hunspell-test.dic"); - args.put("affix", "hunspell-test.aff"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(new SolrResourceLoader("solr/collection1")); - - Reader reader = new StringReader("abc"); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "ab" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestHyphenationCompoundWordTokenFilterFactory.java =================================================================== --- 
solr/core/src/test/org/apache/solr/analysis/TestHyphenationCompoundWordTokenFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestHyphenationCompoundWordTokenFilterFactory.java (working copy) @@ -1,81 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests to ensure the Hyphenation compound filter factory is working. - */ -public class TestHyphenationCompoundWordTokenFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the factory works with hyphenation grammar+dictionary: using default options. 
- */ - public void testHyphenationWithDictionary() throws Exception { - Reader reader = new StringReader("min veninde som er lidt af en læsehest"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - Map args = new HashMap(); - args.put("hyphenator", "da_UTF8.xml"); - args.put("dictionary", "da_compoundDictionary.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - TokenStream stream = factory.create(tokenizer); - - assertTokenStreamContents(stream, - new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, - new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 } - ); - } - - /** - * Ensure the factory works with no dictionary: using hyphenation grammar only. - * Also change the min/max subword sizes from the default. When using no dictionary, - * its generally necessary to tweak these, or you get lots of expansions. 
- */ - public void testHyphenationOnly() throws Exception { - Reader reader = new StringReader("basketballkurv"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - Map args = new HashMap(); - args.put("hyphenator", "da_UTF8.xml"); - args.put("minSubwordSize", "2"); - args.put("maxSubwordSize", "4"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - TokenStream stream = factory.create(tokenizer); - - assertTokenStreamContents(stream, - new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" } - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestIndonesianStemFilterFactory.java (working copy) @@ -1,60 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Indonesian stem filter factory is working. - */ -public class TestIndonesianStemFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually stems text. - */ - public void testStemming() throws Exception { - Reader reader = new StringReader("dibukukannya"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory(); - Map args = new HashMap(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "buku" }); - } - - /** - * Test inflectional-only mode - */ - public void testStemmingInflectional() throws Exception { - Reader reader = new StringReader("dibukukannya"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - IndonesianStemFilterFactory factory = new IndonesianStemFilterFactory(); - Map args = new HashMap(); - args.put("stemDerivational", "false"); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "dibukukan" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestIrishLowerCaseFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package 
org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Irish lowercase filter factory is working. 
- */ -public class TestIrishLowerCaseFilterFactory extends BaseTokenStreamTestCase { - public void testCasing() throws Exception { - Reader reader = new StringReader("nAthair tUISCE hARD"); - IrishLowerCaseFilterFactory factory = new IrishLowerCaseFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "n-athair", "t-uisce", "hard" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestItalianLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestItalianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestItalianLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Italian light stem factory is working. 
- */ -public class TestItalianLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("ragazzo ragazzi"); - ItalianLightStemFilterFactory factory = new ItalianLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "ragazz", "ragazz" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestJapaneseBaseFormFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestJapaneseBaseFormFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestJapaneseBaseFormFilterFactory.java (working copy) @@ -1,46 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests for {@link JapaneseBaseFormFilterFactory} - */ -public class TestJapaneseBaseFormFilterFactory extends BaseTokenStreamTestCase { - public void testBasics() throws IOException { - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - tokenizerFactory.init(args); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = tokenizerFactory.create(new StringReader("それはまだ実験段階にあります")); - JapaneseBaseFormFilterFactory factory = new JapaneseBaseFormFilterFactory(); - ts = factory.create(ts); - assertTokenStreamContents(ts, - new String[] { "それ", "は", "まだ", "実験", "段階", "に", "ある", "ます" } - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestJapaneseIterationMarkCharFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestJapaneseIterationMarkCharFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestJapaneseIterationMarkCharFilterFactory.java (working copy) @@ -1,99 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * Simple tests for {@link org.apache.solr.analysis.JapaneseIterationMarkCharFilterFactory} - */ -public class TestJapaneseIterationMarkCharFilterFactory extends BaseTokenStreamTestCase { - - public void testIterationMarksWithKeywordTokenizer() throws IOException { - final String text = "時々馬鹿々々しいところゞゝゝミスヾ"; - JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); - CharFilter filter = filterFactory.create(new StringReader(text)); - TokenStream tokenStream = new MockTokenizer(filter, MockTokenizer.KEYWORD, false); - assertTokenStreamContents(tokenStream, new String[]{"時時馬鹿馬鹿しいところどころミスズ"}); - } - - public void testIterationMarksWithJapaneseTokenizer() throws IOException { - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - Map tokenizerArgs = Collections.emptyMap(); - tokenizerFactory.init(tokenizerArgs); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - - JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); - Map filterArgs = Collections.emptyMap(); - filterFactory.init(filterArgs); - - CharFilter filter = 
filterFactory.create( - new StringReader("時々馬鹿々々しいところゞゝゝミスヾ") - ); - TokenStream tokenStream = tokenizerFactory.create(filter); - assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところどころ", "ミ", "スズ"}); - } - - public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException { - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - Map tokenizerArgs = Collections.emptyMap(); - tokenizerFactory.init(tokenizerArgs); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - - JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); - Map filterArgs = new HashMap(); - filterArgs.put("normalizeKanji", "true"); - filterArgs.put("normalizeKana", "false"); - filterFactory.init(filterArgs); - - CharFilter filter = filterFactory.create( - new StringReader("時々馬鹿々々しいところゞゝゝミスヾ") - ); - TokenStream tokenStream = tokenizerFactory.create(filter); - assertTokenStreamContents(tokenStream, new String[]{"時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ"}); - } - - public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException { - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - Map tokenizerArgs = Collections.emptyMap(); - tokenizerFactory.init(tokenizerArgs); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - - JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(); - Map filterArgs = new HashMap(); - filterArgs.put("normalizeKanji", "false"); - filterArgs.put("normalizeKana", "true"); - filterFactory.init(filterArgs); - - CharFilter filter = filterFactory.create( - new StringReader("時々馬鹿々々しいところゞゝゝミスヾ") - ); - TokenStream tokenStream = tokenizerFactory.create(filter); - assertTokenStreamContents(tokenStream, new String[]{"時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ"}); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestJapaneseKatakanaStemFilterFactory.java 
=================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestJapaneseKatakanaStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestJapaneseKatakanaStemFilterFactory.java (working copy) @@ -1,49 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -/** - * Simple tests for {@link JapaneseKatakanaStemFilterFactory} - */ -public class TestJapaneseKatakanaStemFilterFactory extends BaseTokenStreamTestCase { - public void testKatakanaStemming() throws IOException { - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - Map tokenizerArgs = Collections.emptyMap(); - tokenizerFactory.init(tokenizerArgs); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - TokenStream tokenStream = tokenizerFactory.create( - new StringReader("明後日パーティーに行く予定がある。図書館で資料をコピーしました。") - ); - JapaneseKatakanaStemFilterFactory filterFactory = new JapaneseKatakanaStemFilterFactory(); - Map filterArgs = Collections.emptyMap(); - filterFactory.init(filterArgs); - assertTokenStreamContents(filterFactory.create(tokenStream), - new String[]{ "明後日", "パーティ", "に", "行く", "予定", "が", "ある", // パーティー should be stemmed - "図書館", "で", "資料", "を", "コピー", "し", "まし", "た"} // コピー should not be stemmed - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestJapanesePartOfSpeechStopFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestJapanesePartOfSpeechStopFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestJapanesePartOfSpeechStopFilterFactory.java (working copy) @@ -1,56 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; - -/* - * 
Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Simple tests for {@link JapanesePartOfSpeechStopFilterFactory} - */ -public class TestJapanesePartOfSpeechStopFilterFactory extends BaseTokenStreamTestCase { - public void testBasics() throws IOException { - String tags = - "# verb-main:\n" + - "動詞-自立\n"; - - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - tokenizerFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map tokenizerArgs = Collections.emptyMap(); - tokenizerFactory.init(tokenizerArgs); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = tokenizerFactory.create(new StringReader("私は制限スピードを超える。")); - JapanesePartOfSpeechStopFilterFactory factory = new JapanesePartOfSpeechStopFilterFactory(); - Map args = new HashMap(); - args.put("tags", "stoptags.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(new StringMockSolrResourceLoader(tags)); - ts = factory.create(ts); - assertTokenStreamContents(ts, - new String[] { "私", "は", "制限", "スピード", "を" } - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestJapaneseReadingFormFilterFactory.java 
=================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestJapaneseReadingFormFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestJapaneseReadingFormFilterFactory.java (working copy) @@ -1,44 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -/** - * Simple tests for {@link JapaneseReadingFormFilterFactory} - */ -public class TestJapaneseReadingFormFilterFactory extends BaseTokenStreamTestCase { - public void testReadings() throws IOException { - JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(); - Map args = Collections.emptyMap(); - tokenizerFactory.init(args); - tokenizerFactory.inform(new SolrResourceLoader(null, null)); - TokenStream tokenStream = tokenizerFactory.create(new StringReader("先ほどベルリンから来ました。")); - JapaneseReadingFormFilterFactory filterFactory = new JapaneseReadingFormFilterFactory(); - assertTokenStreamContents(filterFactory.create(tokenStream), - new String[] { "サキ", "ホド", "ベルリン", "カラ", "キ", "マシ", "タ" } - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestJapaneseTokenizerFactory.java (working copy) @@ -1,119 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests for {@link JapaneseTokenizerFactory} - */ -public class TestJapaneseTokenizerFactory extends BaseTokenStreamTestCase { - public void testSimple() throws IOException { - JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - factory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = factory.create(new StringReader("これは本ではない")); - assertTokenStreamContents(ts, - new String[] { "これ", "は", "本", "で", "は", "ない" }, - new int[] { 0, 2, 3, 4, 5, 6 }, - new int[] { 2, 3, 4, 5, 6, 8 } - ); - } - - /** - * Test that search mode is enabled and working by default - */ - public void testDefaults() throws IOException { - JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - factory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア")); - assertTokenStreamContents(ts, - new String[] { "シニア", "シニアソフトウェアエンジニア", "ソフトウェア", "エンジニア" } - ); - } - - /** - * Test mode parameter: specifying normal mode - */ - public void testMode() 
throws IOException { - JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); - Map args = new HashMap(); - args.put("mode", "normal"); - factory.init(args); - factory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = factory.create(new StringReader("シニアソフトウェアエンジニア")); - assertTokenStreamContents(ts, - new String[] { "シニアソフトウェアエンジニア" } - ); - } - - /** - * Test user dictionary - */ - public void testUserDict() throws IOException { - String userDict = - "# Custom segmentation for long entries\n" + - "日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞\n" + - "関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,テスト名詞\n" + - "# Custom reading for sumo wrestler\n" + - "朝青龍,朝青龍,アサショウリュウ,カスタム人名\n"; - JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); - Map args = new HashMap(); - args.put("userDictionary", "userdict.txt"); - factory.init(args); - factory.inform(new StringMockSolrResourceLoader(userDict)); - TokenStream ts = factory.create(new StringReader("関西国際空港に行った")); - assertTokenStreamContents(ts, - new String[] { "関西", "国際", "空港", "に", "行っ", "た" } - ); - } - - /** - * Test preserving punctuation - */ - public void testPreservePunctuation() throws IOException { - JapaneseTokenizerFactory factory = new JapaneseTokenizerFactory(); - Map args = new HashMap(); - args.put("discardPunctuation", "false"); - factory.init(args); - factory.inform(new SolrResourceLoader(null, null)); - TokenStream ts = factory.create( - new StringReader("今ノルウェーにいますが、来週の頭日本に戻ります。楽しみにしています!お寿司が食べたいな。。。") - ); - System.out.println(ts.toString()); - assertTokenStreamContents(ts, - new String[] { "今", "ノルウェー", "に", "い", "ます", "が", "、", - "来週", "の", "頭", "日本", "に", "戻り", "ます", "。", - "楽しみ", "に", "し", "て", "い", "ます", "!", - "お", "寿司", "が", "食べ", "たい", "な", "。", "。", "。"} - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java =================================================================== --- 
solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestKeepFilterFactory.java (working copy) @@ -1,60 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -import java.util.Map; -import java.util.HashMap; - -/** - * - * - **/ -public class TestKeepFilterFactory extends BaseTokenStreamTestCase { - - public void testInform() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - assertTrue("loader is null and it shouldn't be", loader != null); - KeepWordFilterFactory factory = new KeepWordFilterFactory(); - Map args = new HashMap(); - args.put("words", "keep-1.txt"); - args.put("ignoreCase", "true"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - CharArraySet words = factory.getWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); - - - factory = new KeepWordFilterFactory(); - args.put("words", "keep-1.txt, keep-2.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - words = factory.getWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); - - - - } -} \ No newline at end of file Index: solr/core/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestKeywordMarkerFilterFactory.java (working copy) @@ -1,68 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests to ensure the keyword marker filter factory is working. 
- */ -public class TestKeywordMarkerFilterFactory extends BaseTokenStreamTestCase { - public void testKeywords() throws IOException { - Reader reader = new StringReader("dogs cats"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory(); - Map args = new HashMap(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - args.put("protected", "protwords.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - - TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); - assertTokenStreamContents(ts, new String[] { "dog", "cats" }); - } - - public void testKeywordsCaseInsensitive() throws IOException { - Reader reader = new StringReader("dogs cats Cats"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - KeywordMarkerFilterFactory factory = new KeywordMarkerFilterFactory(); - Map args = new HashMap(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - args.put("protected", "protwords.txt"); - args.put("ignoreCase", "true"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - - TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); - assertTokenStreamContents(ts, new String[] { "dog", "cats", "Cats" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestKStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestKStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestKStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import 
org.apache.lucene.analysis.TokenStream; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Simple tests to ensure the kstem filter factory is working. - */ -public class TestKStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("bricks"); - KStemFilterFactory factory = new KStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "brick" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestLatvianStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Latvian stem factory is working. - */ -public class TestLatvianStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("tirgiem tirgus"); - LatvianStemFilterFactory factory = new LatvianStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "tirg", "tirg" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestMappingCharFilterFactory.java (working copy) @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.util.LuceneTestCase; - -public class TestMappingCharFilterFactory extends LuceneTestCase { - public void testParseString() throws Exception { - - MappingCharFilterFactory f = new MappingCharFilterFactory(); - - try { - f.parseString( "\\" ); - fail( "escape character cannot be alone." ); - } - catch (InitializationException expected) {} - - assertEquals( "unexpected escaped characters", - "\\\"\n\t\r\b\f", f.parseString( "\\\\\\\"\\n\\t\\r\\b\\f" ) ); - assertEquals( "unexpected escaped characters", - "A", f.parseString( "\\u0041" ) ); - assertEquals( "unexpected escaped characters", - "AB", f.parseString( "\\u0041\\u0042" ) ); - - try { - f.parseString( "\\u000" ); - fail( "invalid length check." ); - } - catch (InitializationException expected) {} - - try { - f.parseString( "\\u123x" ); - fail( "invalid hex number check." 
); - } - catch( NumberFormatException expected ){} - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java (working copy) @@ -20,6 +20,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.synonym.SynonymFilterFactory; import org.apache.lucene.analysis.util.ResourceLoader; import java.io.ByteArrayInputStream; Index: solr/core/src/test/org/apache/solr/analysis/TestNGramFilters.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestNGramFilters.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestNGramFilters.java (working copy) @@ -1,164 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the NGram filter factories are working. - */ -public class TestNGramFilters extends BaseTokenStreamTestCase { - /** - * Test NGramTokenizerFactory - */ - public void testNGramTokenizer() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - NGramTokenizerFactory factory = new NGramTokenizerFactory(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "t", "e", "s", "t", "te", "es", "st" }); - } - /** - * Test NGramTokenizerFactory with min and max gram options - */ - public void testNGramTokenizer2() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - args.put("minGramSize", "2"); - args.put("maxGramSize", "3"); - NGramTokenizerFactory factory = new NGramTokenizerFactory(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "te", "es", "st", "tes", "est" }); - } - /** - * Test the NGramFilterFactory - */ - public void testNGramFilter() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - NGramFilterFactory factory = new NGramFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "t", "e", "s", "t", "te", "es", "st" }); - } - /** - * Test the NGramFilterFactory with min and max gram options - */ - public void testNGramFilter2() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - 
args.put("minGramSize", "2"); - args.put("maxGramSize", "3"); - NGramFilterFactory factory = new NGramFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "te", "es", "st", "tes", "est" }); - } - /** - * Test EdgeNGramTokenizerFactory - */ - public void testEdgeNGramTokenizer() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "t" }); - } - /** - * Test EdgeNGramTokenizerFactory with min and max gram size - */ - public void testEdgeNGramTokenizer2() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - args.put("minGramSize", "1"); - args.put("maxGramSize", "2"); - EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "t", "te" }); - } - /** - * Test EdgeNGramTokenizerFactory with side option - */ - public void testEdgeNGramTokenizer3() throws Exception { - Reader reader = new StringReader("ready"); - Map args = new HashMap(); - args.put("side", "back"); - EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "y" }); - } - /** - * Test EdgeNGramFilterFactory - */ - public void testEdgeNGramFilter() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - 
assertTokenStreamContents(stream, - new String[] { "t" }); - } - /** - * Test EdgeNGramFilterFactory with min and max gram size - */ - public void testEdgeNGramFilter2() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - args.put("minGramSize", "1"); - args.put("maxGramSize", "2"); - EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "t", "te" }); - } - /** - * Test EdgeNGramFilterFactory with side option - */ - public void testEdgeNGramFilter3() throws Exception { - Reader reader = new StringReader("ready"); - Map args = new HashMap(); - args.put("side", "back"); - EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "y" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestNorwegianLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestNorwegianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestNorwegianLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Norwegian Light stem factory is working. - */ -public class TestNorwegianLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("epler eple"); - NorwegianLightStemFilterFactory factory = new NorwegianLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "epl", "epl" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestNorwegianMinimalStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestNorwegianMinimalStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestNorwegianMinimalStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Norwegian Minimal stem factory is working. - */ -public class TestNorwegianMinimalStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("eple eplet epler eplene eplets eplenes"); - NorwegianMinimalStemFilterFactory factory = new NorwegianMinimalStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "epl", "epl", "epl", "epl", "epl", "epl" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java (working copy) @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.IOException; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.*; - -/** - * Simple tests to ensure this factory is working - */ -public class TestPatternReplaceCharFilterFactory extends BaseTokenStreamTestCase { - - // 1111 - // 01234567890123 - // this is test. - public void testNothingChange() throws IOException { - final String BLOCK = "this is test."; - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); - args.put("replacement", "$1$2$3"); - factory.init(args); - CharFilter cs = factory.create( - new StringReader( BLOCK ) ); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "this", "is", "test." 
}, - new int[] { 0, 5, 8 }, - new int[] { 4, 7, 13 }); - } - - // 012345678 - // aa bb cc - public void testReplaceByEmpty() throws IOException { - final String BLOCK = "aa bb cc"; - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); - factory.init(args); - CharFilter cs = factory.create( - new StringReader( BLOCK ) ); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - ts.reset(); - assertFalse(ts.incrementToken()); - ts.end(); - ts.close(); - } - - // 012345678 - // aa bb cc - // aa#bb#cc - public void test1block1matchSameLength() throws IOException { - final String BLOCK = "aa bb cc"; - PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "(aa)\\s+(bb)\\s+(cc)"); - args.put("replacement", "$1#$2#$3"); - factory.init(args); - CharFilter cs = factory.create( - new StringReader( BLOCK ) ); - TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false); - assertTokenStreamContents(ts, - new String[] { "aa#bb#cc" }, - new int[] { 0 }, - new int[] { 8 }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java (working copy) @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -/** - * Simple tests to ensure this factory is working - */ -public class TestPatternReplaceFilterFactory extends BaseTokenStreamTestCase { - - public void testReplaceAll() throws Exception { - String input = "aabfooaabfooabfoob ab caaaaaaaaab"; - PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory(); - Map args = new HashMap(); - args.put("pattern", "a*b"); - args.put("replacement", "-"); - factory.init(args); - TokenStream ts = factory.create - (new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false)); - - assertTokenStreamContents(ts, - new String[] { "-foo-foo-foo-", "-", "c-" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java (working copy) @@ -1,41 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; - -/** Simple Tests to ensure this factory is working */ -public class TestPatternTokenizerFactory extends BaseTokenStreamTestCase { - public void testFactory() throws Exception { - final String INPUT = "Günther Günther is here"; - - // create PatternTokenizer - Map args = new HashMap(); - args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" ); - PatternTokenizerFactory tokFactory = new PatternTokenizerFactory(); - tokFactory.init( args ); - TokenStream stream = tokFactory.create( new StringReader(INPUT) ); - assertTokenStreamContents(stream, - new String[] { "Günther", "Günther", "is", "here" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestPersianNormalizationFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Persian normalization factory is working. - */ -public class TestPersianNormalizationFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually normalizes persian text. 
- */ - public void testNormalization() throws Exception { - Reader reader = new StringReader("های"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory(); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "هاي" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestPhoneticFilterFactory.java (working copy) @@ -1,185 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.commons.codec.language.Metaphone; -import org.apache.commons.codec.language.Caverphone2; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.util.LuceneTestCase.Slow; - - -/** - * - */ -@Slow -public class TestPhoneticFilterFactory extends BaseTokenStreamTestCase { - - private static final int REPEATS = 100000; - - /** - * Case: default - */ - public void testFactory() - { - Map args = new HashMap(); - - PhoneticFilterFactory ff = new PhoneticFilterFactory(); - - args.put( PhoneticFilterFactory.ENCODER, "Metaphone" ); - ff.init( args ); - assertTrue( ff.getEncoder() instanceof Metaphone ); - assertTrue( ff.inject ); // default - - args.put( PhoneticFilterFactory.INJECT, "false" ); - ff.init( args ); - assertFalse( ff.inject ); - - args.put( PhoneticFilterFactory.MAX_CODE_LENGTH, "2"); - ff.init( args ); - assertEquals(2,((Metaphone) ff.getEncoder()).getMaxCodeLen()); - } - - /** - * Case: Failures and Exceptions - */ - public void testFactoryCaseFailure() - { - Map args = new HashMap(); - - PhoneticFilterFactory ff = new PhoneticFilterFactory(); - try { - ff.init( args ); - fail( "missing encoder parameter" ); - } - catch( Exception ex ) {} - args.put( PhoneticFilterFactory.ENCODER, "XXX" ); - try { - ff.init( args ); - fail( "unknown encoder parameter" ); - } - catch( Exception ex ) {} - args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.NonExistence" ); - try { - ff.init( args ); - fail( "unknown encoder parameter" ); - } - catch( Exception ex ) {} - } - - /** - * Case: Reflection - */ - public void testFactoryCaseReflection() - { - Map args = new HashMap(); - - PhoneticFilterFactory ff = new PhoneticFilterFactory(); - - 
args.put( PhoneticFilterFactory.ENCODER, "org.apache.commons.codec.language.Metaphone" ); - ff.init( args ); - assertTrue( ff.getEncoder() instanceof Metaphone ); - assertTrue( ff.inject ); // default - - // we use "Caverphone2" as it is registered in the REGISTRY as Caverphone, - // so this effectively tests reflection without package name - args.put( PhoneticFilterFactory.ENCODER, "Caverphone2" ); - ff.init( args ); - assertTrue( ff.getEncoder() instanceof Caverphone2 ); - assertTrue( ff.inject ); // default - - // cross check with registry - args.put( PhoneticFilterFactory.ENCODER, "Caverphone" ); - ff.init( args ); - assertTrue( ff.getEncoder() instanceof Caverphone2 ); - assertTrue( ff.inject ); // default - } - - public void testAlgorithms() throws Exception { - assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg", - new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); - assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg", - new String[] { "A", "B", "KKK", "ESKS" }); - - assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg", - new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); - assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg", - new String[] { "A", "PP", "KK", "ASKS" }); - - assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg", - new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); - assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg", - new String[] { "A000", "B000", "C000", "E220" }); - - assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg", - new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); - assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg", - new String[] { "A0", "B1", "C3", "E034034" }); - - assertAlgorithm("Caverphone", "true", "Darda Karleen Datha Carlene", - new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen", - "TTA1111111", "Datha", "KLN1111111", "Carlene" 
}); - assertAlgorithm("Caverphone", "false", "Darda Karleen Datha Carlene", - new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); - - assertAlgorithm("ColognePhonetic", "true", "Meier Schmitt Meir Schmidt", - new String[] { "67", "Meier", "862", "Schmitt", - "67", "Meir", "862", "Schmidt" }); - assertAlgorithm("ColognePhonetic", "false", "Meier Schmitt Meir Schmidt", - new String[] { "67", "862", "67", "862" }); - } - - static void assertAlgorithm(String algName, String inject, String input, - String[] expected) throws Exception { - Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false); - Map args = new HashMap(); - args.put("encoder", algName); - args.put("inject", inject); - PhoneticFilterFactory factory = new PhoneticFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, expected); - } - - public void testSpeed() throws Exception { - checkSpeedEncoding("Metaphone", "easgasg", "ESKS"); - checkSpeedEncoding("DoubleMetaphone", "easgasg", "ASKS"); - checkSpeedEncoding("Soundex", "easgasg", "E220"); - checkSpeedEncoding("RefinedSoundex", "easgasg", "E034034"); - checkSpeedEncoding("Caverphone", "Carlene", "KLN1111111"); - checkSpeedEncoding("ColognePhonetic", "Schmitt", "862"); - } - - private void checkSpeedEncoding(String encoder, String toBeEncoded, String estimated) throws Exception { - long start = System.currentTimeMillis(); - for ( int i=0; i toks = Arrays.asList(tokens).iterator(); - RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory(); - final TokenStream ts = factory.create - (new TokenStream() { - CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - @Override - public boolean incrementToken() { - if (toks.hasNext()) { - 
clearAttributes(); - Token tok = toks.next(); - termAtt.setEmpty().append(tok); - offsetAtt.setOffset(tok.startOffset(), tok.endOffset()); - posIncAtt.setPositionIncrement(tok.getPositionIncrement()); - return true; - } else { - return false; - } - } - }); - - assertTokenStreamContents(ts, expected.split("\\s")); - } - - public void testSimpleDups() throws Exception { - testDups("A B C D E" - ,tok(1,"A", 0, 4) - ,tok(1,"B", 5, 10) - ,tok(0,"B",11, 15) - ,tok(1,"C",16, 20) - ,tok(0,"D",16, 20) - ,tok(1,"E",21, 25) - ); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestReverseStringFilterFactory.java (working copy) @@ -1,47 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Reverse string filter factory is working. - */ -public class TestReverseStringFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually reverses text. - */ - public void testReversing() throws Exception { - Reader reader = new StringReader("simple test"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - ReverseStringFilterFactory factory = new ReverseStringFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "elpmis", "tset" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestRussianLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestRussianLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestRussianLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Russian light stem factory is working. - */ -public class TestRussianLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("журналы"); - RussianLightStemFilterFactory factory = new RussianLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "журнал" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestShingleFilterFactory.java (working copy) @@ -1,239 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Shingle filter factory works. - */ -public class TestShingleFilterFactory extends BaseTokenStreamTestCase { - /** - * Test the defaults - */ - public void testDefaults() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] {"this", "this is", "is", - "is a", "a", "a test", "test"}); - } - - /** - * Test with unigrams disabled - */ - public void testNoUnigrams() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("outputUnigrams", "false"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] {"this is", "is a", "a test"}); - } - - /** - * Test with a higher max shingle size - */ - public void testMaxShingleSize() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("maxShingleSize", 
"3"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] {"this", "this is", "this is a", "is", - "is a", "is a test", "a", "a test", "test"}); - } - - /** - * Test with higher min (and max) shingle size - */ - public void testMinShingleSize() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("minShingleSize", "3"); - args.put("maxShingleSize", "4"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this", "this is a", "this is a test", - "is", "is a test", "a", "test" }); - } - - /** - * Test with higher min (and max) shingle size and with unigrams disabled - */ - public void testMinShingleSizeNoUnigrams() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("minShingleSize", "3"); - args.put("maxShingleSize", "4"); - args.put("outputUnigrams", "false"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this is a", "this is a test", "is a test" }); - } - - /** - * Test with higher same min and max shingle size - */ - public void testEqualMinAndMaxShingleSize() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("minShingleSize", "3"); - args.put("maxShingleSize", "3"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, 
false)); - assertTokenStreamContents(stream, - new String[] { "this", "this is a", "is", "is a test", "a", "test" }); - } - - /** - * Test with higher same min and max shingle size and with unigrams disabled - */ - public void testEqualMinAndMaxShingleSizeNoUnigrams() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("minShingleSize", "3"); - args.put("maxShingleSize", "3"); - args.put("outputUnigrams", "false"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this is a", "is a test" }); - } - - /** - * Test with a non-default token separator - */ - public void testTokenSeparator() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("tokenSeparator", "=BLAH="); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this", "this=BLAH=is", "is", "is=BLAH=a", - "a", "a=BLAH=test", "test" }); - } - - /** - * Test with a non-default token separator and with unigrams disabled - */ - public void testTokenSeparatorNoUnigrams() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("tokenSeparator", "=BLAH="); - args.put("outputUnigrams", "false"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this=BLAH=is", "is=BLAH=a", "a=BLAH=test" }); - } - - /** - * Test with an empty token separator - */ - public void testEmptyTokenSeparator() throws Exception 
{ - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("tokenSeparator", ""); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this", "thisis", "is", "isa", "a", "atest", "test" }); - } - - /** - * Test with higher min (and max) shingle size - * and with a non-default token separator - */ - public void testMinShingleSizeAndTokenSeparator() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("minShingleSize", "3"); - args.put("maxShingleSize", "4"); - args.put("tokenSeparator", "=BLAH="); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this", "this=BLAH=is=BLAH=a", - "this=BLAH=is=BLAH=a=BLAH=test", "is", - "is=BLAH=a=BLAH=test", "a", "test" }); - } - - /** - * Test with higher min (and max) shingle size - * and with a non-default token separator - * and with unigrams disabled - */ - public void testMinShingleSizeAndTokenSeparatorNoUnigrams() throws Exception { - Reader reader = new StringReader("this is a test"); - Map args = new HashMap(); - args.put("minShingleSize", "3"); - args.put("maxShingleSize", "4"); - args.put("tokenSeparator", "=BLAH="); - args.put("outputUnigrams", "false"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, - new String[] { "this=BLAH=is=BLAH=a", "this=BLAH=is=BLAH=a=BLAH=test", - "is=BLAH=a=BLAH=test", }); - } - - /** - * Test with unigrams disabled except when there are no shingles, with - * a single 
input token. Using default min/max shingle sizes: 2/2. No - * shingles will be created, since there are fewer input tokens than - * min shingle size. However, because outputUnigramsIfNoShingles is - * set to true, even though outputUnigrams is set to false, one - * unigram should be output. - */ - public void testOutputUnigramsIfNoShingles() throws Exception { - Reader reader = new StringReader("test"); - Map args = new HashMap(); - args.put("outputUnigrams", "false"); - args.put("outputUnigramsIfNoShingles", "true"); - ShingleFilterFactory factory = new ShingleFilterFactory(); - factory.init(args); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "test" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestSpanishLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestSpanishLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestSpanishLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Spanish Light stem factory is working. - */ -public class TestSpanishLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("sociedades"); - SpanishLightStemFilterFactory factory = new SpanishLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "sociedad" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestStandardFactories.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestStandardFactories.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestStandardFactories.java (working copy) @@ -1,186 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the standard lucene factories are working. - */ -public class TestStandardFactories extends BaseTokenStreamTestCase { - /** - * Test StandardTokenizerFactory - */ - public void testStandardTokenizer() throws Exception { - Reader reader = new StringReader("Wha\u0301t's this thing do?"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"Wha\u0301t's", "this", "thing", "do" }); - } - - public void testStandardTokenizerMaxTokenLength() throws Exception { - StringBuilder builder = new StringBuilder(); - for (int i = 0 ; i < 100 ; ++i) { - builder.append("abcdefg"); // 7 * 100 = 700 char "word" - } - String longWord = builder.toString(); - String content = "one two three " + longWord + " four five six"; - Reader reader = new StringReader(content); - Map args = new HashMap(); - args.put("maxTokenLength", "1000"); - StandardTokenizerFactory factory = new StandardTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"one", "two", "three", longWord, "four", "five", "six" }); - } - - /** - * Test ClassicTokenizerFactory - */ - public void testClassicTokenizer() throws Exception { - Reader reader = new StringReader("What's this thing 
do?"); - ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"What's", "this", "thing", "do" }); - } - - public void testClassicTokenizerMaxTokenLength() throws Exception { - StringBuilder builder = new StringBuilder(); - for (int i = 0 ; i < 100 ; ++i) { - builder.append("abcdefg"); // 7 * 100 = 700 char "word" - } - String longWord = builder.toString(); - String content = "one two three " + longWord + " four five six"; - Reader reader = new StringReader(content); - Map args = new HashMap(); - args.put("maxTokenLength", "1000"); - ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"one", "two", "three", longWord, "four", "five", "six" }); - } - - /** - * Test ClassicFilterFactory - */ - public void testStandardFilter() throws Exception { - Reader reader = new StringReader("What's this thing do?"); - ClassicTokenizerFactory factory = new ClassicTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - ClassicFilterFactory filterFactory = new ClassicFilterFactory(); - filterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - filterFactory.init(args); - Tokenizer tokenizer = factory.create(reader); - TokenStream stream = filterFactory.create(tokenizer); - assertTokenStreamContents(stream, - new String[] {"What", "this", "thing", "do"}); - } - - /** - * Test KeywordTokenizerFactory - */ - public void testKeywordTokenizer() throws Exception { - Reader reader = new StringReader("What's this thing do?"); - KeywordTokenizerFactory factory = new KeywordTokenizerFactory(); - 
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"What's this thing do?"}); - } - - /** - * Test WhitespaceTokenizerFactory - */ - public void testWhitespaceTokenizer() throws Exception { - Reader reader = new StringReader("What's this thing do?"); - WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"What's", "this", "thing", "do?"}); - } - - /** - * Test LetterTokenizerFactory - */ - public void testLetterTokenizer() throws Exception { - Reader reader = new StringReader("What's this thing do?"); - LetterTokenizerFactory factory = new LetterTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"What", "s", "this", "thing", "do"}); - } - - /** - * Test LowerCaseTokenizerFactory - */ - public void testLowerCaseTokenizer() throws Exception { - Reader reader = new StringReader("What's this thing do?"); - LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"what", "s", "this", "thing", "do"}); - } - - /** - * Ensure the ASCIIFoldingFilterFactory works - */ - public void testASCIIFolding() throws Exception { - Reader reader = new StringReader("Česká"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - ASCIIFoldingFilterFactory factory = new 
ASCIIFoldingFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "Ceska" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestStemmerOverrideFilterFactory.java (working copy) @@ -1,69 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -/** - * Simple tests to ensure the stemmer override filter factory is working. - */ -public class TestStemmerOverrideFilterFactory extends BaseTokenStreamTestCase { - public void testKeywords() throws IOException { - // our stemdict stems dogs to 'cat' - Reader reader = new StringReader("testing dogs"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); - Map args = new HashMap(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - args.put("dictionary", "stemdict.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - - TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); - assertTokenStreamContents(ts, new String[] { "test", "cat" }); - } - - public void testKeywordsCaseInsensitive() throws IOException { - Reader reader = new StringReader("testing DoGs"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - StemmerOverrideFilterFactory factory = new StemmerOverrideFilterFactory(); - Map args = new HashMap(); - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - args.put("dictionary", "stemdict.txt"); - args.put("ignoreCase", "true"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - - TokenStream ts = new PorterStemFilter(factory.create(tokenizer)); - 
assertTokenStreamContents(ts, new String[] { "test", "cat" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestStopFilterFactory.java (working copy) @@ -1,76 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -import java.util.Map; -import java.util.HashMap; - -/** - * - * - **/ -public class TestStopFilterFactory extends BaseTokenStreamTestCase { - - public void testInform() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - assertTrue("loader is null and it shouldn't be", loader != null); - StopFilterFactory factory = new StopFilterFactory(); - Map args = new HashMap(); - args.put("words", "stop-1.txt"); - args.put("ignoreCase", "true"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - CharArraySet words = factory.getStopWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2); - assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); - - factory = new StopFilterFactory(); - args.put("words", "stop-1.txt, stop-2.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - words = factory.getStopWords(); - assertTrue("words is null and it shouldn't be", words != null); - assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4); - assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true); - - factory = new StopFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - args.put("words", "stop-snowball.txt"); - args.put("format", "snowball"); - factory.init(args); - factory.inform(loader); - words = factory.getStopWords(); - assertEquals(8, words.size()); - assertTrue(words.contains("he")); - assertTrue(words.contains("him")); - assertTrue(words.contains("his")); - 
assertTrue(words.contains("himself")); - assertTrue(words.contains("she")); - assertTrue(words.contains("her")); - assertTrue(words.contains("hers")); - assertTrue(words.contains("herself")); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestSwedishLightStemFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestSwedishLightStemFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestSwedishLightStemFilterFactory.java (working copy) @@ -1,37 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure the Swedish Light stem factory is working. 
- */ -public class TestSwedishLightStemFilterFactory extends BaseTokenStreamTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("äpplen äpple"); - SwedishLightStemFilterFactory factory = new SwedishLightStemFilterFactory(); - TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(stream, new String[] { "äppl", "äppl" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestSynonymFilterFactory.java (working copy) @@ -1,83 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.StringReader; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.synonym.SynonymFilter; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; - -public class TestSynonymFilterFactory extends BaseTokenStreamTestCase { - /** test that we can parse and use the solr syn file */ - public void testSynonyms() throws Exception { - SynonymFilterFactory factory = new SynonymFilterFactory(); - Map args = new HashMap(); - args.put("synonyms", "synonyms.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(new SolrResourceLoader("solr/collection1")); - TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false)); - assertTrue(ts instanceof SynonymFilter); - assertTokenStreamContents(ts, - new String[] { "GB", "gib", "gigabyte", "gigabytes" }, - new int[] { 1, 0, 0, 0 }); - } - - /** if the synonyms are completely empty, test that we still analyze correctly */ - public void testEmptySynonyms() throws Exception { - SynonymFilterFactory factory = new SynonymFilterFactory(); - Map args = new HashMap(); - args.put("synonyms", "synonyms.txt"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(new StringMockSolrResourceLoader("")); // empty file! 
- TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false)); - assertTokenStreamContents(ts, new String[] { "GB" }); - } - - private class StringMockSolrResourceLoader implements ResourceLoader { - String text; - - StringMockSolrResourceLoader(String text) { - this.text = text; - } - - public List getLines(String resource) throws IOException { - return Arrays.asList(text.split("\n")); - } - - public T newInstance(String cname, Class expectedType, String... subpackages) { - return null; - } - - public InputStream openResource(String resource) throws IOException { - return new ByteArrayInputStream(text.getBytes("UTF-8")); - } - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestThaiWordFilterFactory.java (working copy) @@ -1,50 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.th.ThaiWordFilter; - -/** - * Simple tests to ensure the Thai word filter factory is working. - */ -public class TestThaiWordFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually decomposes text. - */ - public void testWordBreak() throws Exception { - assumeTrue("JRE does not support Thai dictionary-based BreakIterator", ThaiWordFilter.DBBI_AVAILABLE); - Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - ThaiWordFilterFactory factory = new ThaiWordFilterFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้", - "ต้อง", "แสดง", "ว่า", "งาน", "ดี"}); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestTrimFilterFactory.java (working copy) @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.solr.analysis; - -import java.io.StringReader; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; - -/** - * Simple tests to ensure this factory is working - */ -public class TestTrimFilterFactory extends BaseTokenStreamTestCase { - public void testTrimming() throws Exception { - TrimFilterFactory factory = new TrimFilterFactory(); - Map args = new HashMap(); - args.put("updateOffsets", "false"); - factory.init(args); - TokenStream ts = factory.create(new MockTokenizer(new StringReader("trim me "), MockTokenizer.KEYWORD, false)); - assertTokenStreamContents(ts, new String[] { "trim me" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestTurkishLowerCaseFilterFactory.java (working copy) @@ -1,42 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; - -/** - * Simple tests to ensure the Turkish lowercase filter factory is working. - */ -public class TestTurkishLowerCaseFilterFactory extends BaseTokenStreamTestCase { - /** - * Ensure the filter actually lowercases text. - */ - public void testCasing() throws Exception { - Reader reader = new StringReader("AĞACI"); - Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); - TurkishLowerCaseFilterFactory factory = new TurkishLowerCaseFilterFactory(); - TokenStream stream = factory.create(tokenizer); - assertTokenStreamContents(stream, new String[] { "ağacı" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestTypeTokenFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestTypeTokenFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestTypeTokenFilterFactory.java (working copy) @@ -1,105 +0,0 @@ -package org.apache.solr.analysis; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.NumericTokenStream; -import org.apache.lucene.analysis.util.InitializationException; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.solr.core.SolrResourceLoader; -import org.junit.Test; - -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -/** - * Testcase for {@link TypeTokenFilterFactory} - */ -public class TestTypeTokenFilterFactory extends BaseTokenStreamTestCase { - - @Test - public void testInform() throws Exception { - ResourceLoader loader = new SolrResourceLoader("solr/collection1"); - TypeTokenFilterFactory factory = new TypeTokenFilterFactory(); - Map args = new HashMap(); - args.put("types", "stoptypes-1.txt"); - args.put("enablePositionIncrements", "true"); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - factory.inform(loader); - Set types = factory.getStopTypes(); - assertTrue("types is null and it shouldn't be", types != null); - assertTrue("types Size: " + types.size() + " is not: " + 2, types.size() == 2); - assertTrue("enablePositionIncrements was set to true but not correctly parsed", factory.isEnablePositionIncrements()); - - factory = new TypeTokenFilterFactory(); - 
args.put("types", "stoptypes-1.txt, stoptypes-2.txt"); - args.put("enablePositionIncrements", "false"); - args.put("useWhitelist","true"); - factory.init(args); - factory.inform(loader); - types = factory.getStopTypes(); - assertTrue("types is null and it shouldn't be", types != null); - assertTrue("types Size: " + types.size() + " is not: " + 4, types.size() == 4); - assertTrue("enablePositionIncrements was set to false but not correctly parsed", !factory.isEnablePositionIncrements()); - } - - @Test - public void testCreationWithBlackList() throws Exception { - TypeTokenFilterFactory typeTokenFilterFactory = new TypeTokenFilterFactory(); - Map args = new HashMap(); - args.put("types", "stoptypes-1.txt, stoptypes-2.txt"); - args.put("enablePositionIncrements", "false"); - typeTokenFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - typeTokenFilterFactory.init(args); - NumericTokenStream input = new NumericTokenStream(); - input.setIntValue(123); - typeTokenFilterFactory.create(input); - } - - @Test - public void testCreationWithWhiteList() throws Exception { - TypeTokenFilterFactory typeTokenFilterFactory = new TypeTokenFilterFactory(); - Map args = new HashMap(); - args.put("types", "stoptypes-1.txt, stoptypes-2.txt"); - args.put("enablePositionIncrements", "false"); - args.put("useWhitelist","true"); - typeTokenFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - typeTokenFilterFactory.init(args); - NumericTokenStream input = new NumericTokenStream(); - input.setIntValue(123); - typeTokenFilterFactory.create(input); - } - - @Test - public void testMissingTypesParameter() throws Exception { - try { - TypeTokenFilterFactory typeTokenFilterFactory = new TypeTokenFilterFactory(); - Map args = new HashMap(); - args.put("enablePositionIncrements", "false"); - typeTokenFilterFactory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - typeTokenFilterFactory.init(args); - typeTokenFilterFactory.inform(new SolrResourceLoader(null, null)); - fail("not supplying 
'types' parameter should cause an InitializationException"); - } catch (InitializationException e) { - // everything ok - } - } - -} Index: solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestUAX29URLEmailTokenizerFactory.java (working copy) @@ -1,193 +0,0 @@ -package org.apache.solr.analysis; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.util.Version; - -/** - * A few tests based on org.apache.lucene.analysis.TestUAX29URLEmailTokenizer - */ - -public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamTestCase { - - public void testUAX29URLEmailTokenizer() throws Exception { - Reader reader = new StringReader("Wha\u0301t's this thing do?"); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"Wha\u0301t's", "this", "thing", "do" }); - } - - public void testArabic() throws Exception { - Reader reader = new StringReader("الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008."); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", - "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" }); - } - - public void testChinese() throws Exception { - Reader reader = new StringReader("我是中国人。 1234 Tests "); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = 
factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"我", "是", "中", "国", "人", "1234", "Tests"}); - } - - public void testKorean() throws Exception { - Reader reader = new StringReader("안녕하세요 한글입니다"); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"안녕하세요", "한글입니다"}); - } - - public void testHyphen() throws Exception { - Reader reader = new StringReader("some-dashed-phrase"); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"some", "dashed", "phrase"}); - } - - // Test with some URLs from TestUAX29URLEmailTokenizer's - // urls.from.random.text.with.urls.txt - public void testURLs() throws Exception { - String textWithURLs - = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on\n" - + " some extra\nWords thrown in here. 
" - + "http://c5-3486.bisynxu.FR/aI.YnNms/" - + " samba Halta gamba " - + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n" - + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n" - + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m" - + " inter Locutio " - + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n" - + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7" - + " blah Sirrah woof " - + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n"; - Reader reader = new StringReader(textWithURLs); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { - "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram¶graphs=50&length=200&no-ads=on", - "some", "extra", "Words", "thrown", "in", "here", - "http://c5-3486.bisynxu.FR/aI.YnNms/", - "samba", "Halta", "gamba", - "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R", - "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb", - "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m", - "inter", "Locutio", - "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/", - "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7", - "blah", "Sirrah", "woof", - "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4" - } - ); - } - - // Test with some emails from TestUAX29URLEmailTokenizer's - // email.addresses.from.random.text.with.email.addresses.txt - public void testEmails() throws Exception { - String textWithEmails - = " some extra\nWords thrown in here. 
" - + "dJ8ngFi@avz13m.CC\n" - + "kU-l6DS@[082.015.228.189]\n" - + "\"%U\u0012@?\\B\"@Fl2d.md" - + " samba Halta gamba " - + "Bvd#@tupjv.sn\n" - + "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt\n" - + "~+Kdz@3mousnl.SE\n" - + " inter Locutio " - + "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY\n" - + "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM" - + " blah Sirrah woof " - + "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae\n" - + "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H\n"; - Reader reader = new StringReader(textWithEmails); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - Map args = Collections.emptyMap(); - factory.init(args); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { - "some", "extra", "Words", "thrown", "in", "here", - "dJ8ngFi@avz13m.CC", - "kU-l6DS@[082.015.228.189]", - "\"%U\u0012@?\\B\"@Fl2d.md", - "samba", "Halta", "gamba", - "Bvd#@tupjv.sn", - "SBMm0Nm.oyk70.rMNdd8k.#ru3LI.gMMLBI.0dZRD4d.RVK2nY@au58t.B13albgy4u.mt", - "~+Kdz@3mousnl.SE", - "inter", "Locutio", - "C'ts`@Vh4zk.uoafcft-dr753x4odt04q.UY", - "}0tzWYDBuy@cSRQAABB9B.7c8xawf75-cyo.PM", - "blah", "Sirrah", "woof", - "lMahAA.j/5.RqUjS745.DtkcYdi@d2-4gb-l6.ae", - "lv'p@tqk.vj5s0tgl.0dlu7su3iyiaz.dqso.494.3hb76.XN--MGBAAM7A8H" - } - ); - } - - public void testMaxTokenLength() throws Exception { - StringBuilder builder = new StringBuilder(); - for (int i = 0 ; i < 100 ; ++i) { - builder.append("abcdefg"); // 7 * 100 = 700 char "word" - } - String longWord = builder.toString(); - String content = "one two three " + longWord + " four five six"; - Reader reader = new StringReader(content); - Map args = new HashMap(); - args.put("maxTokenLength", "1000"); - UAX29URLEmailTokenizerFactory factory = new UAX29URLEmailTokenizerFactory(); - factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); - factory.init(args); - Tokenizer stream = 
factory.create(reader); - assertTokenStreamContents(stream, - new String[] {"one", "two", "three", longWord, "four", "five", "six" }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestWikipediaTokenizerFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestWikipediaTokenizerFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestWikipediaTokenizerFactory.java (working copy) @@ -1,43 +0,0 @@ -package org.apache.solr.analysis; - -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Simple tests to ensure the wikipedia tokenizer is working. 
- */ -public class TestWikipediaTokenizerFactory extends BaseTokenStreamTestCase { - public void testTokenizer() throws IOException { - Reader reader = new StringReader("This is a [[Category:foo]]"); - WikipediaTokenizerFactory factory = new WikipediaTokenizerFactory(); - Tokenizer tokenizer = factory.create(reader); - assertTokenStreamContents(tokenizer, - new String[] { "This", "is", "a", "foo" }, - new int[] { 0, 5, 8, 21 }, - new int[] { 4, 7, 9, 24 }, - new String[] { "", "", "", WikipediaTokenizer.CATEGORY }, - new int[] { 1, 1, 1, 1, }); - } -} Index: solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java =================================================================== --- solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/analysis/TestWordDelimiterFilterFactory.java (working copy) @@ -1,3 +1,5 @@ +package org.apache.solr.analysis; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -15,8 +17,6 @@ * limitations under the License. */ -package org.apache.solr.analysis; - import java.io.StringReader; import java.util.HashMap; import java.util.Map; @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.solr.SolrTestCaseJ4; +import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory; import org.apache.lucene.analysis.util.ResourceLoader; import org.apache.solr.core.SolrResourceLoader; import org.junit.BeforeClass; @@ -33,6 +34,7 @@ /** * New WordDelimiterFilter tests... 
most of the tests are in ConvertedLegacyTest */ +// TODO: add a low-level test for this factory public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 { @BeforeClass Index: solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java =================================================================== --- solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/core/ResourceLoaderTest.java (working copy) @@ -20,8 +20,8 @@ import junit.framework.Assert; import org.apache.lucene.util.LuceneTestCase; -import org.apache.solr.analysis.KeywordTokenizerFactory; -import org.apache.solr.analysis.NGramFilterFactory; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; +import org.apache.lucene.analysis.ngram.NGramFilterFactory; import org.apache.solr.common.SolrException; import org.apache.solr.handler.admin.LukeRequestHandler; import org.apache.solr.handler.component.FacetComponent; Index: solr/core/src/test/org/apache/solr/schema/MultiTermTest.java =================================================================== --- solr/core/src/test/org/apache/solr/schema/MultiTermTest.java (revision 1365483) +++ solr/core/src/test/org/apache/solr/schema/MultiTermTest.java (working copy) @@ -18,6 +18,12 @@ */ import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory; +import org.apache.lucene.analysis.core.KeywordTokenizerFactory; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory; +import org.apache.lucene.analysis.miscellaneous.TrimFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.analysis.*; Index: solr/example =================================================================== --- solr/example 
(revision 1365483) +++ solr/example (working copy) Property changes on: solr/example ___________________________________________________________________ Modified: svn:mergeinfo Merged /lucene/dev/branches/lucene2510/solr/example:r1364862-1365496