Index: contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUAnalyzer.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUAnalyzer.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUAnalyzer.java (revision 0) @@ -0,0 +1,89 @@ +package org.apache.lucene.analysis.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.io.Reader; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.LowerCaseFilter; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.WordlistLoader; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardFilter; + +/** + * A multilingual analyzer designed to be similar to {@link StandardAnalyzer} + * Words are broken according to word breaking rules in the Unicode standard. 
+ * + * Depends on ICU (http://www.icu-project.org) + * + */ +public class ICUAnalyzer extends Analyzer { + private Set stopSet; + private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; + + public ICUAnalyzer() { + this(StandardAnalyzer.STOP_WORDS); + } + /** Builds an analyzer with the given stop words. */ + public ICUAnalyzer(Set stopWords) { + stopSet = stopWords; + } + + /** Builds an analyzer with the given stop words. */ + public ICUAnalyzer(String[] stopWords) { + stopSet = StopFilter.makeStopSet(stopWords); + } + + /** Builds an analyzer with the stop words from the given file. + * @see WordlistLoader#getWordSet(File) + */ + public ICUAnalyzer(File stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + /** Builds an analyzer with the stop words from the given reader. + * @see WordlistLoader#getWordSet(Reader) + */ + public ICUAnalyzer(Reader stopwords) throws IOException { + stopSet = WordlistLoader.getWordSet(stopwords); + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream wsTokenStream = new WhitespaceTokenizer(reader); + ICUTokenizer tokenStream = new ICUTokenizer(wsTokenStream); + tokenStream.setMaxTokenLength(maxTokenLength); + TokenStream result = new StandardFilter(tokenStream); + result = new LowerCaseFilter(result); + result = new StopFilter(result, stopSet); + return result; + } + + public int getMaxTokenLength() { + return maxTokenLength; + } + + public void setMaxTokenLength(int maxTokenLength) { + this.maxTokenLength = maxTokenLength; + } +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUBreakIterator.java =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUBreakIterator.java (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUBreakIterator.java (revision 0) @@ -0,0 +1,103 @@ +package org.apache.lucene.analysis.icu; + +/** + * 
Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +import org.apache.lucene.analysis.standard.StandardTokenizer; + +import com.ibm.icu.text.RuleBasedBreakIterator; + +/** + * A RuleBasedBreakIterator that tags tokens with {@link StandardTokenizer} categories. + */ +class ICUBreakIterator extends RuleBasedBreakIterator { + protected static final int WORD_APOSTROPHE = 601; + protected static final int WORD_ACRONYM = 602; + protected static final int WORD_HOST = 603; + protected static final int WORD_COMPANY = 604; + protected static final int WORD_EMAIL = 605; + + + /** + * Create an ICUBreakIterator + * @throws IOException + */ + ICUBreakIterator() throws IOException { + this(getRules()); + } + + /** + * Create an ICUBreakIterator based upon the provided rules + * @param rules Word breaking rules + */ + ICUBreakIterator(String rules) { + super(rules); + } + + /** + * Get the {@link StandardTokenizer} token type. 
+ * @return One of the token types defined in {@link StandardTokenizer} + */ + String getTokenType() { + switch (getRuleStatus()) { + case WORD_ACRONYM: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ACRONYM]; + case WORD_EMAIL: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMAIL]; + case WORD_COMPANY: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.COMPANY]; + case WORD_HOST: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]; + case WORD_NUMBER: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; + case WORD_APOSTROPHE: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.APOSTROPHE]; + case WORD_IDEO: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ]; + case WORD_KANA: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; + case WORD_LETTER: + return StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; + case WORD_NONE: + default: + return null; + } + } + + /** + * Read in the rule file. + * @return Contents of rule file as a String + * @throws IOException + */ + static String getRules() throws IOException { + StringBuffer rules = new StringBuffer(); + InputStream in = ICUBreakIterator.class.getResourceAsStream("ICUBreakIterator.rul"); + BufferedReader cin = new BufferedReader(new InputStreamReader(in, "UTF-8")); + String line = null; + while ((line = cin.readLine()) != null) { + if (!line.startsWith("#")) + rules.append(line + "\n"); + } + cin.close(); + return rules.toString(); + } +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUBreakIterator.rul =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUBreakIterator.rul (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUBreakIterator.rul (revision 0) @@ -0,0 +1,157 @@ +!!chain; +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#
+#
+# A modified version of the default RBBI rules which you can see if you did this:
+# RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)
+#    BreakIterator.getCharacterInstance();
+# System.out.println(rbbi.toString());
+#
+# Modifications include:
+# 1. Proper handling of Hebrew punctuation
+#    The default unicode spec works great except no one really uses geresh and gershayim.
+#    Since single and double quotes are the reality, they are treated special in Hebrew context.
+#    Single quote is allowed medially and finally but only after Hebrew characters.
+#    Single quote can be either ' or `
+#    Double quote is allowed medially, but only after Hebrew characters.
+#
+# 2. StandardTokenizer categories
+#    These are implemented as rule status codes.
+# These are currently NOT completely backward compatible with StandardTokenizer
+#
+
+$CR = [\p{Word_Break = CR}];
+$LF = [\p{Word_Break = LF}];
+$Newline = [\p{Word_Break = Newline}];
+$Extend = [\p{Word_Break = Extend}];
+$Format = [\p{Word_Break = Format}];
+$Katakana = [\p{Word_Break = Katakana}];
+$ALetter = [\p{Word_Break = ALetter}];
+$MidNumLet = [\p{Word_Break = MidNumLet}];
+$Hebrew = [\p{script=Hebrew} \uFB1D-\uFB4F];
+$MidHebrew = [\' \` \"];
+$ExtendHebrew = [\' \`];
+$MidLetter = [\p{Word_Break = MidLetter}];
+$MidNum = [\p{Word_Break = MidNum}];
+$Numeric = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
+$dictionary = [:LineBreak = Complex_Context:];
+$Control = [\p{Grapheme_Cluster_Break = Control}];
+$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
+
+$HebrewEx = $Hebrew ($Extend | $Format)*;
+$MidHebrewEx = $MidHebrew ($Extend | $Format)*;
+$ExtendHebrewEx = $ExtendHebrew ($Extend | $Format)*;
+$KatakanaEx = $Katakana ($Extend | $Format)*;
+$ALetterEx = $ALetterPlus ($Extend | $Format)*;
+$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
+$MidLetterEx = $MidLetter ($Extend | $Format)*;
+$MidNumEx = $MidNum ($Extend | $Format)*;
+$NumericEx = $Numeric ($Extend | $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
+$Hiragana = [\p{script=Hiragana}];
+$Ideographic = [\p{Ideographic}];
+$HiraganaEx = $Hiragana ($Extend | $Format)*;
+$IdeographicEx = $Ideographic ($Extend | $Format)*;
+
+# Additional items similar to StandardTokenizer
+# Note: these are not used for breaking standard words, only for the special types.
+#
+$Alpha = [\p{Alphabetic}];
+$AlphaNum = ([\p{alnum}])+;
+
+# Variable AlphaMinusHebrew: used for apostrophes.
+# Note: Apostrophes within hebrew context are handled separately.
+$AlphaMinusHebrew = [\p{Alphabetic}-$Hebrew];
+
+# Email address
+$Email = $AlphaNum ([\.\-\_] $AlphaNum)* \@ $AlphaNum ([\.\-] $AlphaNum)+;
+# Company
+$Company = ($Alpha)+ [\&\@] ($Alpha)+;
+# Acronym
+$Acronym = ($Alpha)+ \. ($Alpha \.)+;
+# Host
+$Host = $AlphaNum (\. $AlphaNum)+;
+# Apostrophe
+$Apostrophe = $AlphaMinusHebrew+ (\' $AlphaMinusHebrew+)+;
+
+# Status codes: 100 = NUM, 200 = ALPHANUM/word, 300 = kana, 400 = ideographic
+!!forward;
+$CR $LF;
+[^$CR $LF $Newline]? ($Extend | $Format)+;
+$NumericEx {100};
+$ALetterEx {200};
+$HebrewEx {200};
+$KatakanaEx {300};
+$HiraganaEx {300};
+$IdeographicEx {400};
+$ALetterEx $ALetterEx {200};
+$HebrewEx $HebrewEx {200};
+$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
+$HebrewEx ($MidLetterEx | $MidNumLetEx | $MidHebrewEx) $HebrewEx {200};
+$NumericEx $NumericEx {100};
+$ALetterEx $NumericEx {200};
+$NumericEx $ALetterEx {200};
+$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
+$KatakanaEx $KatakanaEx {300};
+$ALetterEx $ExtendNumLetEx {200};
+$HebrewEx ($ExtendNumLetEx | $ExtendHebrewEx) {200};
+$NumericEx $ExtendNumLetEx {100};
+$KatakanaEx $ExtendNumLetEx {300};
+$ExtendNumLetEx $ExtendNumLetEx {200};
+$ExtendNumLetEx $ALetterEx {200};
+$ExtendNumLetEx $HebrewEx {200};
+$ExtendNumLetEx $NumericEx {100};
+$ExtendNumLetEx $KatakanaEx {300};
+
+# Special-type status codes 601-605 match the WORD_* constants declared in
+# ICUBreakIterator.java (apostrophe/acronym/host/company/email).
+$Apostrophe{601};
+$Acronym{602};
+$Host{603};
+$Company{604};
+$Email{605};
+
+!!reverse;
+$BackALetterEx = ($Format | $Extend)* $ALetterPlus;
+$BackMidNumLetEx = ($Format | $Extend)* $MidNumLet;
+$BackNumericEx = ($Format | $Extend)* $Numeric;
+$BackMidNumEx = ($Format | $Extend)* $MidNum;
+$BackMidLetterEx = ($Format | $Extend)* $MidLetter;
+$BackKatakanaEx = ($Format | $Extend)* $Katakana;
+$BackExtendNumLetEx= ($Format | $Extend)* $ExtendNumLet;
+
+$LF $CR;
+($Format | $Extend)* [^$CR $LF $Newline]?;
+$BackALetterEx $BackALetterEx;
+$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
+$BackNumericEx $BackNumericEx;
+$BackNumericEx $BackALetterEx;
+$BackALetterEx $BackNumericEx;
+$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
+$BackKatakanaEx $BackKatakanaEx;
+$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
+($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
+
+!!safe_reverse;
+($Extend | $Format)+ .?;
+($MidLetter | $MidNumLet) $BackALetterEx;
+($MidNum | $MidNumLet) $BackNumericEx;
+$dictionary $dictionary;
+
+!!safe_forward;
+($Extend | $Format)+ .?;
+($MidLetterEx | $MidNumLetEx) $ALetterEx;
+($MidNumEx | $MidNumLetEx) $NumericEx;
+$dictionary $dictionary;
Index: contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUTokenizer.java
===================================================================
--- contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUTokenizer.java	(revision 0)
+++ contrib/analyzers/src/java/org/apache/lucene/analysis/icu/ICUTokenizer.java	(revision 0)
@@ -0,0 +1,130 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +import com.ibm.icu.text.BreakIterator; + +/** + * A multilingual tokenizer based on ICU RuleBasedBreakIterator + * + * Depends on ICU (http://www.icu-project.org/) + * + */ + +public class ICUTokenizer extends TokenFilter { + private static ICUBreakIterator compiled; + private ICUBreakIterator breaker; + private Token icuToken = null; + + private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; + + /** Set the max allowed token length. Any token longer + * than this is skipped. */ + public void setMaxTokenLength(int length) { + this.maxTokenLength = length; + } + + /** @see #setMaxTokenLength */ + public int getMaxTokenLength() { + return maxTokenLength; + } + + // Creating a new BreakIterator is expensive (compilation of rules) + // Cloning one is cheap. Stash the compiled one away and clone it when necessary. 
+ static { + try { + compiled = new ICUBreakIterator(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * @param input + */ + public ICUTokenizer(TokenStream input) { + super(input); + try { + breaker = (ICUBreakIterator)compiled.clone(); + } catch (Exception e) {}; + } + + /* (non-Javadoc) + * @see org.apache.lucene.analysis.TokenStream#next(org.apache.lucene.analysis.Token) + */ + public Token next(final Token reusableToken) throws IOException { + int posIncr = 1; + + if (icuToken != null) { + int start = breaker.current(); + int end = breaker.next(); + while ((breaker.getRuleStatus() == ICUBreakIterator.WORD_NONE || (end - start) > maxTokenLength) + && end != BreakIterator.DONE) { + if ((end - start) > maxTokenLength) + posIncr++; + start = end; + end = breaker.next(); + } + if (end != BreakIterator.DONE) { + reusableToken.reinit(icuToken, icuToken.termBuffer(), start, end - start); + reusableToken.setStartOffset(icuToken.startOffset()+start); + reusableToken.setEndOffset(icuToken.endOffset()+end); + reusableToken.setType(breaker.getTokenType()); + reusableToken.setPositionIncrement(posIncr); + return reusableToken; + } + icuToken = null; + } + + while (true) { + Token nextToken = input.next(reusableToken); + if (nextToken == null) { + return null; + } + + String text = nextToken.term(); + + icuToken = (Token) nextToken.clone(); + breaker.setText(text); + int start = breaker.current(); + int end = breaker.next(); + while ((breaker.getRuleStatus() == ICUBreakIterator.WORD_NONE || (end - start) > maxTokenLength) + && end != BreakIterator.DONE) { + if ((end - start) > maxTokenLength) + posIncr++; + start = end; + end = breaker.next(); + } + if (end != BreakIterator.DONE) { + nextToken.setTermBuffer(text, start, end - start); + nextToken.setStartOffset(nextToken.startOffset() + start); + nextToken.setEndOffset(nextToken.startOffset() + end); + nextToken.setType(breaker.getTokenType()); + nextToken.setPositionIncrement(posIncr); + 
return nextToken; + } + } + } +} Index: contrib/analyzers/src/java/org/apache/lucene/analysis/icu/package.html =================================================================== --- contrib/analyzers/src/java/org/apache/lucene/analysis/icu/package.html (revision 0) +++ contrib/analyzers/src/java/org/apache/lucene/analysis/icu/package.html (revision 0) @@ -0,0 +1,5 @@ +
+ +Analyzer based on Unicode word break rules + + Index: contrib/analyzers/src/test/org/apache/lucene/analysis/icu/TestICUAnalyzer.java =================================================================== --- contrib/analyzers/src/test/org/apache/lucene/analysis/icu/TestICUAnalyzer.java (revision 0) +++ contrib/analyzers/src/test/org/apache/lucene/analysis/icu/TestICUAnalyzer.java (revision 0) @@ -0,0 +1,305 @@ +package org.apache.lucene.analysis.icu; + +import java.io.StringReader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +import junit.framework.TestCase; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TestICUAnalyzer extends TestCase { + + private Analyzer a = new ICUAnalyzer(); + + public void assertAnalyzesTo(Analyzer a, String input, String[] expected) throws Exception { + assertAnalyzesTo(a, input, expected, null); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception { + assertAnalyzesTo(a, input, expectedImages, expectedTypes, null); + } + + public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception { + TokenStream ts = a.tokenStream("dummy", new StringReader(input)); + final Token reusableToken = new Token(); + for (int i = 0; i < expectedImages.length; i++) { + Token nextToken = ts.next(reusableToken); + assertNotNull(nextToken); + assertEquals(expectedImages[i], nextToken.term()); + if (expectedTypes != null) { + assertEquals(expectedTypes[i], nextToken.type()); + } + if (expectedPosIncrs != null) { + assertEquals(expectedPosIncrs[i], nextToken.getPositionIncrement()); + } + } + assertNull(ts.next(reusableToken)); + ts.close(); + } + + + public void testMaxTermLength() throws Exception { + ICUAnalyzer sa = new ICUAnalyzer(); + sa.setMaxTokenLength(5); + assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}); + } + + public void testMaxTermLength2() throws Exception { + ICUAnalyzer sa = new ICUAnalyzer(); + assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"}); + sa.setMaxTokenLength(5); + + assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1}); + } + + public void testMaxTermLength3() throws Exception { + char[] chars = new char[255]; + for(int i=0;i<255;i++) + chars[i] = 'a'; + String longTerm = new String(chars, 0, 255); + + assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"}); + + /* This test will fail because 
CharTokenizer has the following + * + * private static final int MAX_WORD_LEN = 255; + * + assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"}); + */ + } + + public void testAlphanumeric() throws Exception { + // alphanumeric tokens + assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); + assertAnalyzesTo(a, "2B", new String[]{"2b"}); + } + + /* + * This test will fail because the Unicode definition says otherwise. + * + * public void testUnderscores() throws Exception { + // underscores are delimiters, but not in email addresses (below) + assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", "having", "underscore"}); + assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new String[]{"word", "underscore", "stopwords"}); + } + */ + + public void testDelimiters() throws Exception { + // other delimiters: "-", "/", "," + assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"}); + assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"}); + assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"}); + } + + public void testApostrophes() throws Exception { + // internal apostrophes: O'Reilly, you're, O'Reilly's + // possessives are actually removed by StardardFilter, not the tokenizer + assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); + assertAnalyzesTo(a, "you're", new String[]{"you're"}); + assertAnalyzesTo(a, "she's", new String[]{"she"}); + assertAnalyzesTo(a, "Jim's", new String[]{"jim"}); + assertAnalyzesTo(a, "don't", new String[]{"don't"}); + assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"}); + } + + public void testTSADash() throws Exception { + // t and s had been stopwords in Lucene <= 2.0, which made it impossible + // to correctly search for these terms: + assertAnalyzesTo(a, "s-class", new String[]{"s", "class"}); + assertAnalyzesTo(a, "t-com", new String[]{"t", "com"}); + // 'a' is still a stopword: + assertAnalyzesTo(a, "a-class", new 
String[]{"class"}); + } + + public void testCompanyNames() throws Exception { + // company names + assertAnalyzesTo(a, "AT&T", new String[]{"at&t"}); + assertAnalyzesTo(a, "Excite@Home", new String[]{"excite@home"}); + } + + public void testLucene1140() throws Exception { + try { + ICUAnalyzer analyzer = new ICUAnalyzer(); + assertAnalyzesTo(analyzer, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "