Index: CHANGES.txt =================================================================== --- CHANGES.txt (revision 904100) +++ CHANGES.txt (working copy) @@ -127,6 +127,11 @@ * LUCENE-2198: Support protected words in stemming TokenFilters using a new KeywordAttribute. (Simon Willnauer via Uwe Schindler) + +* LUCENE-2183: Added Unicode 4 support to CharTokenizer and its subclasses. + CharTokenizer now has new int-API which is conditionally preferred to + the old char-API depending on the provided Version. Version < 3.1 will + use the char-API. (Simon Willnauer via Uwe Schindler) Optimizations Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (revision 904100) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (working copy) @@ -18,8 +18,10 @@ import java.io.Reader; +import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.LetterTokenizer; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Version; /** * Tokenizer that breaks text into runs of letters and diacritics. @@ -27,28 +29,101 @@ * The problem with the standard Letter tokenizer is that it fails on diacritics. * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc. *

- *
+ *
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link ArabicLetterTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
*/ public class ArabicLetterTokenizer extends LetterTokenizer { + + /** + * Construct a new ArabicLetterTokenizer. + * @param matchVersion Lucene version + * to match See {@link above} + * + * @param in + * the input to split up into tokens + */ + public ArabicLetterTokenizer(Version matchVersion, Reader in) { + super(matchVersion, in); + } + + /** + * Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param source + * the attribute source to use for this Tokenizer + * @param in + * the input to split up into tokens + */ + public ArabicLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) { + super(matchVersion, source, in); + } + + /** + * Construct a new ArabicLetterTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. * @param + * matchVersion Lucene version to match See + * {@link above} + * + * @param factory + * the attribute factory to use for this Tokenizer + * @param in + * the input to split up into tokens + */ + public ArabicLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { + super(matchVersion, factory, in); + } + + /** + * Construct a new ArabicLetterTokenizer. + * + * @deprecated use {@link #ArabicLetterTokenizer(Version, Reader)} instead. This will + * be removed in Lucene 4.0. + */ + @Deprecated public ArabicLetterTokenizer(Reader in) { super(in); } + /** + * Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}. + * + * @deprecated use {@link #ArabicLetterTokenizer(Version, AttributeSource, Reader)} + * instead. This will be removed in Lucene 4.0. + */ + @Deprecated public ArabicLetterTokenizer(AttributeSource source, Reader in) { super(source, in); } + /** + * Construct a new ArabicLetterTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @deprecated use {@link #ArabicLetterTokenizer(Version, AttributeSource.AttributeFactory, Reader)} + * instead. This will be removed in Lucene 4.0. 
+ */ + @Deprecated public ArabicLetterTokenizer(AttributeFactory factory, Reader in) { super(factory, in); } + /** * Allows for Letter category or NonspacingMark category - * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char) + * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(int) */ @Override - protected boolean isTokenChar(char c) { + protected boolean isTokenChar(int c) { return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK; } Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (revision 904100) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java (working copy) @@ -136,7 +136,7 @@ * @param matchVersion Lucene version to match See * {@link above} * @param stopwords a stopword set - * @param a stemming exclusion set + * @param stemExclusionTable a stemming exclusion set */ public CzechAnalyzer(Version matchVersion, Set stopwords, Set stemExclusionTable) { this.matchVersion = matchVersion; Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java =================================================================== --- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (revision 904100) +++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (working copy) @@ -22,39 +22,107 @@ import org.apache.lucene.analysis.Tokenizer; // for javadocs import org.apache.lucene.analysis.LetterTokenizer; // for javadocs import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Version; /** * A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer} - * by also allowing the basic latin digits 0-9. + * by also allowing the basic Latin digits 0-9. + *

+ *
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link RussianLetterTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
*/ - public class RussianLetterTokenizer extends CharTokenizer { - public RussianLetterTokenizer(Reader in) - { - super(in); + private static final int DIGIT_0 = '0'; + private static final int DIGIT_9 = '9'; + + /** + * Construct a new RussianLetterTokenizer. * @param matchVersion Lucene version + * to match See {@link above} + * + * @param in + * the input to split up into tokens + */ + public RussianLetterTokenizer(Version matchVersion, Reader in) { + super(matchVersion, in); } - public RussianLetterTokenizer(AttributeSource source, Reader in) - { - super(source, in); + /** + * Construct a new RussianLetterTokenizer using a given {@link AttributeSource}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param source + * the attribute source to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public RussianLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) { + super(matchVersion, source, in); } - public RussianLetterTokenizer(AttributeFactory factory, Reader in) - { - super(factory, in); + /** + * Construct a new RussianLetterTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. * @param + * matchVersion Lucene version to match See + * {@link above} + * + * @param factory + * the attribute factory to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public RussianLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { + super(matchVersion, factory, in); } /** + * Construct a new RussianLetterTokenizer. + * + * @deprecated use {@link #RussianLetterTokenizer(Version, Reader)} instead. This will + * be removed in Lucene 4.0. + */ + @Deprecated + public RussianLetterTokenizer(Reader in) { + super(in); + } + + /** + * Construct a new RussianLetterTokenizer using a given {@link AttributeSource}. + * + * @deprecated use {@link #RussianLetterTokenizer(Version, AttributeSource, Reader)} + * instead. This will be removed in Lucene 4.0. + */ + @Deprecated + public RussianLetterTokenizer(AttributeSource source, Reader in) { + super(source, in); + } + + /** + * Construct a new RussianLetterTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @deprecated use {@link #RussianLetterTokenizer(Version, AttributeSource.AttributeFactory, Reader)} + * instead. This will be removed in Lucene 4.0. + */ + @Deprecated + public RussianLetterTokenizer(AttributeFactory factory, Reader in) { + super(factory, in); + } + + + /** * Collects only characters which satisfy - * {@link Character#isLetter(char)}. + * {@link Character#isLetter(int)}. */ @Override - protected boolean isTokenChar(char c) - { - if (Character.isLetter(c) || (c >= '0' && c <= '9')) - return true; - else - return false; + protected boolean isTokenChar(int c) { + return Character.isLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9); } } Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java (revision 0) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java (revision 0) @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.ar; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; + +/** + * Testcase for {@link ArabicLetterTokenizer} + */ +public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase { + + public void testArabicLetterTokenizer() throws IOException { + StringReader reader = new StringReader("1234567890 Tokenizer \ud801\udc1c\u0300test"); + ArabicLetterTokenizer tokenizer = new ArabicLetterTokenizer(Version.LUCENE_31, + reader); + assertTokenStreamContents(tokenizer, new String[] {"Tokenizer", + "\ud801\udc1c\u0300test"}); + } + + public void testArabicLetterTokenizerBWCompat() throws IOException { + StringReader reader = new StringReader("1234567890 Tokenizer \ud801\udc1c\u0300test"); + ArabicLetterTokenizer tokenizer = new ArabicLetterTokenizer(Version.LUCENE_30, + reader); + assertTokenStreamContents(tokenizer, new String[] {"Tokenizer", "\u0300test"}); + } +} Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\ar\TestArabicLetterTokenizer.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java =================================================================== --- contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java (revision 0) +++ contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianLetterTokenizer.java (revision 0) @@ -0,0 +1,45 @@ +package org.apache.lucene.analysis.ru; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.io.IOException; +import java.io.StringReader; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.util.Version; + +/** + * Testcase for {@link RussianLetterTokenizer} + */ +public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase { + + public void testRussianLetterTokenizer() throws IOException { + StringReader reader = new StringReader("1234567890 Вместе \ud801\udc1ctest"); + RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(Version.LUCENE_31, + reader); + assertTokenStreamContents(tokenizer, new String[] {"1234567890", "Вместе", + "\ud801\udc1ctest"}); + } + + public void testRussianLetterTokenizerBWCompat() throws IOException { + StringReader reader = new StringReader("1234567890 Вместе \ud801\udc1ctest"); + RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(Version.LUCENE_30, + reader); + assertTokenStreamContents(tokenizer, new String[] {"1234567890", "Вместе", "test"}); + } +} Property changes on: contrib\analyzers\common\src\test\org\apache\lucene\analysis\ru\TestRussianLetterTokenizer.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 904100) +++ src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -23,59 +23,262 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.util.Version; +import org.apache.lucene.util.VirtualMethod; +import org.apache.lucene.util.CharacterUtils.CharacterBuffer; -/** An abstract base class for simple, character-oriented tokenizers.*/ +/** + * An abstract base class for simple, character-oriented tokenizers. + *

+ * You must specify the required {@link Version} compatibility when creating
+ * {@link CharTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link #isTokenChar(int)} and
+ * {@link #normalize(int)} for details.</li>
+ * </ul>
+ **/ public abstract class CharTokenizer extends Tokenizer { - public CharTokenizer(Reader input) { + + /** + * Creates a new {@link CharTokenizer} instance + * + * @param matchVersion + * Lucene version to match See {@link
above} + * @param input + * the input to split up into tokens + */ + public CharTokenizer(Version matchVersion, Reader input) { super(input); + charUtils = CharacterUtils.getInstance(matchVersion); offsetAtt = addAttribute(OffsetAttribute.class); termAtt = addAttribute(TermAttribute.class); + useOldAPI = useOldAPI(matchVersion); + ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); + } - - public CharTokenizer(AttributeSource source, Reader input) { + + /** + * Creates a new {@link CharTokenizer} instance + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param source + * the attribute source to use for this {@link Tokenizer} + * @param input + * the input to split up into tokens + */ + public CharTokenizer(Version matchVersion, AttributeSource source, + Reader input) { super(source, input); + charUtils = CharacterUtils.getInstance(matchVersion); offsetAtt = addAttribute(OffsetAttribute.class); termAtt = addAttribute(TermAttribute.class); + useOldAPI = useOldAPI(matchVersion); + ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); } - - public CharTokenizer(AttributeFactory factory, Reader input) { + + /** + * Creates a new {@link CharTokenizer} instance + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param factory + * the attribute factory to use for this {@link Tokenizer} + * @param input + * the input to split up into tokens + */ + public CharTokenizer(Version matchVersion, AttributeFactory factory, + Reader input) { super(factory, input); + charUtils = CharacterUtils.getInstance(matchVersion); offsetAtt = addAttribute(OffsetAttribute.class); termAtt = addAttribute(TermAttribute.class); + useOldAPI = useOldAPI(matchVersion); + ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); } + /** + * Creates a new {@link CharTokenizer} instance + * @param input the input to split up into tokens + * @deprecated use {@link #CharTokenizer(Version, Reader)} instead. This will be + * removed in Lucene 4.0. + */ + @Deprecated + public CharTokenizer(Reader input) { + this(Version.LUCENE_30, input); + } + + /** + * Creates a new {@link CharTokenizer} instance + * @param input the input to split up into tokens + * @param source the attribute source to use for this {@link Tokenizer} + * @deprecated use {@link #CharTokenizer(Version, AttributeSource, Reader)} instead. This will be + * removed in Lucene 4.0. + */ + @Deprecated + public CharTokenizer(AttributeSource source, Reader input) { + this(Version.LUCENE_30, source, input); + } + + /** + * Creates a new {@link CharTokenizer} instance + * @param input the input to split up into tokens + * @param factory the attribute factory to use for this {@link Tokenizer} + * @deprecated use {@link #CharTokenizer(Version, AttributeSource.AttributeFactory, Reader)} instead. This will be + * removed in Lucene 4.0. 
+ */ + @Deprecated + public CharTokenizer(AttributeFactory factory, Reader input) { + this(Version.LUCENE_30, factory, input); + } + private int offset = 0, bufferIndex = 0, dataLen = 0; private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 4096; - private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; - private TermAttribute termAtt; - private OffsetAttribute offsetAtt; + private final TermAttribute termAtt; + private final OffsetAttribute offsetAtt; + + private final CharacterUtils charUtils; + private final CharacterBuffer ioBuffer; + + /** + * @deprecated this will be removed in lucene 4.0 + */ + @Deprecated + private final boolean useOldAPI; + + private static final VirtualMethod isTokenCharMethod = + new VirtualMethod(CharTokenizer.class, "isTokenChar", char.class); + + private static final VirtualMethod normalizeMethod = + new VirtualMethod(CharTokenizer.class, "normalize", char.class); + + /** + * Returns true iff a codepoint should be included in a token. This tokenizer + * generates as tokens adjacent sequences of characters which satisfy this + * predicate. Characters for which this is false are used to define token + * boundaries and are not included in tokens. + * @deprecated use {@link #isTokenChar(int)} instead. This method will be + * removed in Lucene 4.0. + */ + @Deprecated + protected boolean isTokenChar(char c) { + return isTokenChar((int)c); + } - /** Returns true iff a character should be included in a token. This - * tokenizer generates as tokens adjacent sequences of characters which - * satisfy this predicate. Characters for which this is false are used to - * define token boundaries and are not included in tokens. */ - protected abstract boolean isTokenChar(char c); + /** + * Called on each token character to normalize it before it is added to the + * token. The default implementation does nothing. Subclasses may use this to, + * e.g., lowercase tokens. + * @deprecated use {@link #normalize(int)} instead. This method will be removed in Lucene 4.0. + */ + @Deprecated + protected char normalize(char c) { + return (char) normalize((int) c); + } + + /** + * Returns true iff a character should be included in a token. This tokenizer + * generates as tokens adjacent sequences of characters which satisfy this + * predicate. Characters for which this is false are used to define token + * boundaries and are not included in tokens. + *

+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of chars. Subclasses
+ * of {@link CharTokenizer} must not override the char based methods if a
+ * {@link Version} >= 3.1 is passed to the constructor.
+ * <p>
+ * NOTE: This method will be marked abstract in Lucene 4.0.
+ */ + protected boolean isTokenChar(int c) { + throw new UnsupportedOperationException("since LUCENE_3_1 subclasses of CharTokenizer must implement isTokenChar(int)"); + } - /** Called on each token character to normalize it before it is added to the - * token. The default implementation does nothing. Subclasses may use this - * to, e.g., lowercase tokens. */ - protected char normalize(char c) { + /** + * Called on each token character to normalize it before it is added to the + * token. The default implementation does nothing. Subclasses may use this to, + * e.g., lowercase tokens. + *

+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of chars. Subclasses
+ * of {@link CharTokenizer} must not override the char based methods if a
+ * {@link Version} >= 3.1 is passed to the constructor.
+ * <p>
+ * NOTE: This method will be marked abstract in Lucene 4.0.
+ */ + protected int normalize(int c) { return c; } @Override public final boolean incrementToken() throws IOException { clearAttributes(); + if(useOldAPI) // TODO remove this in LUCENE 4.0 + return incrementTokenOld(); int length = 0; int start = bufferIndex; char[] buffer = termAtt.termBuffer(); while (true) { + if (bufferIndex >= dataLen) { + offset += dataLen; + if(!charUtils.fill(ioBuffer, input)) { + dataLen = 0; // so next offset += dataLen won't decrement offset + if (length > 0) + break; + else + return false; + } + dataLen = ioBuffer.getLength(); + bufferIndex = 0; + } + final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex); + bufferIndex += Character.charCount(c); + + if (isTokenChar(c)) { // if it's a token char + if (length == 0) // start of token + start = offset + bufferIndex - 1; + else if (length >= buffer.length-1) // check if a supplementary could run out of bounds + buffer = termAtt.resizeTermBuffer(2+length); // make sure a supplementary fits in the buffer + length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized + if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test + break; + } else if (length > 0) // at non-Letter w/ chars + break; // return 'em + } + + termAtt.setTermLength(length); + offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); + return true; + + } + + /** + * The <= 3.0 version of incrementToken. This is a backwards compat implementation used + * if a version <= 3.0 is provided to the ctor. + * @deprecated remove in 4.0 + */ + @Deprecated + private boolean incrementTokenOld() throws IOException { + int length = 0; + int start = bufferIndex; + char[] buffer = termAtt.termBuffer(); + final char[] oldIoBuffer = ioBuffer.getBuffer(); + while (true) { + if (bufferIndex >= dataLen) { offset += dataLen; - dataLen = input.read(ioBuffer); + dataLen = input.read(oldIoBuffer); if (dataLen == -1) { dataLen = 0; // so next offset += dataLen won't decrement offset if (length > 0) @@ -86,7 +289,7 @@ bufferIndex = 0; } - final char c = ioBuffer[bufferIndex++]; + final char c = oldIoBuffer[bufferIndex++]; if (isTokenChar(c)) { // if it's a token char @@ -107,8 +310,10 @@ termAtt.setTermLength(length); offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); return true; - } + } + + @Override public final void end() { // set final offset @@ -122,5 +327,15 @@ bufferIndex = 0; offset = 0; dataLen = 0; + ioBuffer.reset(); } -} + + private boolean useOldAPI(Version matchVersion) { + final Class clazz = this.getClass(); + if (matchVersion.onOrAfter(Version.LUCENE_31) + && (isTokenCharMethod.isOverriddenAsOf(clazz) || normalizeMethod + .isOverriddenAsOf(clazz))) throw new IllegalArgumentException( + "For matchVersion >= LUCENE_31, CharTokenizer subclasses must not override isTokenChar(char) or normalize(char)."); + return !matchVersion.onOrAfter(Version.LUCENE_31); + } +} \ No newline at end of file Index: src/java/org/apache/lucene/analysis/LetterTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/LetterTokenizer.java (revision 904100) +++ src/java/org/apache/lucene/analysis/LetterTokenizer.java (working copy) @@ -20,34 +20,106 @@ import java.io.Reader; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Version; -/** A LetterTokenizer is a tokenizer that divides text at non-letters. 
That's - to say, it defines tokens as maximal strings of adjacent letters, as defined - by java.lang.Character.isLetter() predicate. +/** + * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to + * say, it defines tokens as maximal strings of adjacent letters, as defined by + * java.lang.Character.isLetter() predicate. + *

+ * Note: this does a decent job for most European languages, but does a terrible
+ * job for some Asian languages, where words are not separated by spaces.
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LetterTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ */ - Note: this does a decent job for most European languages, but does a terrible - job for some Asian languages, where words are not separated by spaces. */ - public class LetterTokenizer extends CharTokenizer { - /** Construct a new LetterTokenizer. */ + + /** + * Construct a new LetterTokenizer. + * + * @param matchVersion + * Lucene version to match See {@link
above} + * @param in + * the input to split up into tokens + */ + public LetterTokenizer(Version matchVersion, Reader in) { + super(matchVersion, in); + } + + /** + * Construct a new LetterTokenizer using a given {@link AttributeSource}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param source + * the attribute source to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public LetterTokenizer(Version matchVersion, AttributeSource source, Reader in) { + super(matchVersion, source, in); + } + + /** + * Construct a new LetterTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param factory + * the attribute factory to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public LetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { + super(matchVersion, factory, in); + } + + /** + * Construct a new LetterTokenizer. + * + * @deprecated use {@link #LetterTokenizer(Version, Reader)} instead. This + * will be removed in Lucene 4.0. + */ public LetterTokenizer(Reader in) { - super(in); + super(Version.LUCENE_30, in); } - /** Construct a new LetterTokenizer using a given {@link AttributeSource}. */ + /** + * Construct a new LetterTokenizer using a given {@link AttributeSource}. + * @deprecated + * use {@link #LetterTokenizer(Version, AttributeSource, Reader)} instead. + * This will be removed in Lucene 4.0. + */ public LetterTokenizer(AttributeSource source, Reader in) { - super(source, in); + super(Version.LUCENE_30, source, in); } - /** Construct a new LetterTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */ + /** + * Construct a new LetterTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @deprecated use {@link #LetterTokenizer(Version, AttributeSource.AttributeFactory, Reader)} + * instead. This will be removed in Lucene 4.0. + */ public LetterTokenizer(AttributeFactory factory, Reader in) { - super(factory, in); + super(Version.LUCENE_30, factory, in); } - + /** Collects only characters which satisfy - * {@link Character#isLetter(char)}.*/ + * {@link Character#isLetter(int)}.*/ @Override - protected boolean isTokenChar(char c) { + protected boolean isTokenChar(int c) { return Character.isLetter(c); } } Index: src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (revision 904100) +++ src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java (working copy) @@ -20,6 +20,7 @@ import java.io.Reader; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Version; /** * LowerCaseTokenizer performs the function of LetterTokenizer @@ -30,27 +31,98 @@ *

* Note: this does a decent job for most European languages, but does a terrible * job for some Asian languages, where words are not separated by spaces. + *

+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LowerCaseTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
*/ public final class LowerCaseTokenizer extends LetterTokenizer { - /** Construct a new LowerCaseTokenizer. */ + + /** + * Construct a new LowerCaseTokenizer. + * + * @param matchVersion + * Lucene version to match See {@link
above} + * + * @param in + * the input to split up into tokens + */ + public LowerCaseTokenizer(Version matchVersion, Reader in) { + super(matchVersion, in); + } + + /** + * Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param source + * the attribute source to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public LowerCaseTokenizer(Version matchVersion, AttributeSource source, Reader in) { + super(matchVersion, source, in); + } + + /** + * Construct a new LowerCaseTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param factory + * the attribute factory to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public LowerCaseTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { + super(matchVersion, factory, in); + } + + /** + * Construct a new LowerCaseTokenizer. + * + * @deprecated use {@link #LowerCaseTokenizer(Reader)} instead. This will be + * removed in Lucene 4.0. + */ + @Deprecated public LowerCaseTokenizer(Reader in) { - super(in); + super(Version.LUCENE_30, in); } - /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */ + /** + * Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. + * + * @deprecated use {@link #LowerCaseTokenizer(AttributeSource, Reader)} + * instead. This will be removed in Lucene 4.0. + */ public LowerCaseTokenizer(AttributeSource source, Reader in) { - super(source, in); + super(Version.LUCENE_30, source, in); } - /** Construct a new LowerCaseTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */ + /** + * Construct a new LowerCaseTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @deprecated use {@link #LowerCaseTokenizer(AttributeSource.AttributeFactory, Reader)} + * instead. This will be removed in Lucene 4.0. + */ public LowerCaseTokenizer(AttributeFactory factory, Reader in) { - super(factory, in); + super(Version.LUCENE_30, factory, in); } /** Converts char to lower case - * {@link Character#toLowerCase(char)}.*/ + * {@link Character#toLowerCase(int)}.*/ @Override - protected char normalize(char c) { + protected int normalize(int c) { return Character.toLowerCase(c); } } Index: src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java =================================================================== --- src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java (revision 904100) +++ src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java (working copy) @@ -20,30 +20,101 @@ import java.io.Reader; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.Version; -/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace. - * Adjacent sequences of non-Whitespace characters form tokens. */ +/** + * A WhitespaceTokenizer is a tokenizer that divides text at whitespace. + * Adjacent sequences of non-Whitespace characters form tokens. + *

+ * You must specify the required {@link Version} compatibility when creating
+ * {@link WhitespaceTokenizer}:
+ * <ul>
+ * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
+ * detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
+ * {@link CharTokenizer#normalize(int)} for details.</li>
+ * </ul>
+ */ +public class WhitespaceTokenizer extends CharTokenizer { + + /** + * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version + * to match See {@link
above} + * + * @param in + * the input to split up into tokens + */ + public WhitespaceTokenizer(Version matchVersion, Reader in) { + super(matchVersion, in); + } -public class WhitespaceTokenizer extends CharTokenizer { - /** Construct a new WhitespaceTokenizer. */ + /** + * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. + * + * @param matchVersion + * Lucene version to match See {@link above} + * @param source + * the attribute source to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public WhitespaceTokenizer(Version matchVersion, AttributeSource source, Reader in) { + super(matchVersion, source, in); + } + + /** + * Construct a new WhitespaceTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. * @param + * matchVersion Lucene version to match See + * {@link above} + * + * @param factory + * the attribute factory to use for this {@link Tokenizer} + * @param in + * the input to split up into tokens + */ + public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory, Reader in) { + super(matchVersion, factory, in); + } + + /** + * Construct a new WhitespaceTokenizer. + * + * @deprecated use {@link #WhitespaceTokenizer(Version, Reader)} instead. This will + * be removed in Lucene 4.0. + */ + @Deprecated public WhitespaceTokenizer(Reader in) { super(in); } - /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */ + /** + * Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. + * + * @deprecated use {@link #WhitespaceTokenizer(Version, AttributeSource, Reader)} + * instead. This will be removed in Lucene 4.0. + */ + @Deprecated public WhitespaceTokenizer(AttributeSource source, Reader in) { super(source, in); } - /** Construct a new WhitespaceTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */ + /** + * Construct a new WhitespaceTokenizer using a given + * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. + * + * @deprecated use {@link #WhitespaceTokenizer(Version, AttributeSource.AttributeFactory, Reader)} + * instead. This will be removed in Lucene 4.0. + */ + @Deprecated public WhitespaceTokenizer(AttributeFactory factory, Reader in) { super(factory, in); } /** Collects only characters which do not satisfy - * {@link Character#isWhitespace(char)}.*/ + * {@link Character#isWhitespace(int)}.*/ @Override - protected boolean isTokenChar(char c) { + protected boolean isTokenChar(int c) { return !Character.isWhitespace(c); } } Index: src/java/org/apache/lucene/util/CharacterUtils.java =================================================================== --- src/java/org/apache/lucene/util/CharacterUtils.java (revision 904100) +++ src/java/org/apache/lucene/util/CharacterUtils.java (working copy) @@ -1,5 +1,8 @@ package org.apache.lucene.util; +import java.io.IOException; +import java.io.Reader; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -104,10 +107,52 @@ * the char array. */ public abstract int codePointAt(final char[] chars, final int offset, final int limit); + + /** + * Creates a new {@link CharacterBuffer} and allocates a char[] + * of the given bufferSize. + * + * @param bufferSize + * the internal char buffer size, must be >= 2 + * @return a new {@link CharacterBuffer} instance. 
+ */ + public static CharacterBuffer newCharacterBuffer(final int bufferSize) { + if(bufferSize < 2) + throw new IllegalArgumentException("buffersize must be >= 2"); + return new CharacterBuffer(new char[bufferSize], 0, 0); + } + /** + * Fills the {@link CharacterBuffer} with characters read from the given + * {@link Reader}. This method tries to read as many characters into + * the {@link CharacterBuffer} as possible; each call to fill will start + * filling the buffer from offset 0 up to the length + * of the internal character array.

+ * Depending on the {@link Version} passed to + * {@link CharacterUtils#getInstance(Version)} this method implements + * supplementary character awareness when filling the given buffer. For all + * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees + * that the given {@link CharacterBuffer} will never contain a high surrogate + * character as the last element in the buffer unless it is the last available + * character in the reader. In other words, high and low surrogate pairs will + * always be preserved across buffer borders. + *

+ * + * @param buffer + * the buffer to fill. + * @param reader + * the reader to read characters from. + * @return true if and only if no more characters are available + * in the reader, otherwise false. + * @throws IOException + * if the reader throws an {@link IOException}. + */ + public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException; + private static final class Java5CharacterUtils extends CharacterUtils { Java5CharacterUtils() { - }; + } @Override public final int codePointAt(final char[] chars, final int offset) { @@ -124,12 +169,32 @@ return Character.codePointAt(chars, offset, limit); } - + @Override + public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException { + final char[] charBuffer = buffer.buffer; + buffer.offset = 0; + charBuffer[0] = buffer.lastTrailingHighSurrogate; + final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1; + buffer.lastTrailingHighSurrogate = 0; + final int read = reader.read(charBuffer, offset, charBuffer.length + - offset); + if (read == -1) { + buffer.length = offset; + return offset != 0; + } + buffer.length = read + offset; + // special case if the read returns 0 and the lastTrailingHighSurrogate was set + if (buffer.length > 1 + && Character.isHighSurrogate(charBuffer[buffer.length - 1])) { + buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; + } + return true; + } } private static final class Java4CharacterUtils extends CharacterUtils { Java4CharacterUtils() { - }; + } @Override public final int codePointAt(final char[] chars, final int offset) { @@ -148,6 +213,72 @@ return chars[offset]; } + @Override + public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException { + buffer.offset = 0; + final int read = reader.read(buffer.buffer); + if(read == -1) + return false; + buffer.length = read; + return true; + } + } + + /** + * A simple IO buffer to use with + * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. + */ + public static final class CharacterBuffer { + + private final char[] buffer; + private int offset; + private int length; + private char lastTrailingHighSurrogate = 0; + + CharacterBuffer(char[] buffer, int offset, int length) { + this.buffer = buffer; + this.offset = offset; + this.length = length; + } + + /** + * Returns the internal buffer + * + * @return the buffer + */ + public char[] getBuffer() { + return buffer; + } + + /** + * Returns the data offset in the internal buffer. + * + * @return the offset + */ + public int getOffset() { + return offset; + } + + /** + * Return the length of the data in the internal buffer starting at + * {@link #getOffset()} + * + * @return the length + */ + public int getLength() { + return length; + } + + /** + * Resets the CharacterBuffer. All internals are reset to its default + * values. + */ + public void reset() { + offset = 0; + length = 0; + lastTrailingHighSurrogate = 0; + } + } } Index: src/test/org/apache/lucene/analysis/TestCharTokenizers.java =================================================================== --- src/test/org/apache/lucene/analysis/TestCharTokenizers.java (revision 0) +++ src/test/org/apache/lucene/analysis/TestCharTokenizers.java (revision 0) @@ -0,0 +1,223 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.Random; + +import org.apache.lucene.util.Version; + +/** + * Testcase for {@link CharTokenizer} subclasses + */ +public class TestCharTokenizers extends BaseTokenStreamTestCase { + + /* + * test to read surrogate pairs without losing the pairing + * if the surrogate pair is at the border of the internal IO buffer + */ + public void testReadSupplementaryChars() throws IOException { + StringBuilder builder = new StringBuilder(); + Random newRandom = newRandom(); + // create random input + int num = 1024 + newRandom.nextInt(1024); + for (int i = 1; i < num; i++) { + builder.append("\ud801\udc1cabc"); + if((i % 10) == 0) + builder.append(" "); + } + // internal buffer size is 1024; make sure we have a surrogate pair right at the border + builder.insert(1023, "\ud801\udc1c"); + LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( + Version.LUCENE_CURRENT, new StringReader(builder.toString())); + assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); + } + + /* + * test to extend the TermAttribute buffer internally. If the internal + * algorithm that extends the size of the char array only extends by 1 char and the + * next char to be filled in is a supplementary codepoint (using 2 chars) an + * index out of bounds exception is triggered.
+ */ + public void testExtendCharBuffer() throws IOException { + for (int i = 0; i < 40; i++) { + StringBuilder builder = new StringBuilder(); + for (int j = 0; j < 1+i; j++) { + builder.append("a"); + } + builder.append("\ud801\udc1cabc"); + LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( + Version.LUCENE_CURRENT, new StringReader(builder.toString())); + assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); + } + } + + /* + * tests the max word length of 255 - tokenizer will split at the 255 char no matter what happens + */ + public void testMaxWordLength() throws IOException { + StringBuilder builder = new StringBuilder(); + + for (int i = 0; i < 255; i++) { + builder.append("A"); + } + LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( + Version.LUCENE_CURRENT, new StringReader(builder.toString() + builder.toString())); + assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); + } + + /* + * tests the max word length of 255 with a surrogate pair at position 255 + */ + public void testMaxWordLengthWithSupplementary() throws IOException { + StringBuilder builder = new StringBuilder(); + + for (int i = 0; i < 254; i++) { + builder.append("A"); + } + builder.append("\ud801\udc1c"); + LowerCaseTokenizer tokenizer = new LowerCaseTokenizer( + Version.LUCENE_CURRENT, new StringReader(builder.toString() + builder.toString())); + assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); + } + + public void testLowerCaseTokenizer() throws IOException { + StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); + LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_31, + reader); + assertTokenStreamContents(tokenizer, new String[] { "tokenizer", + "\ud801\udc44test" }); + } + + public void testLowerCaseTokenizerBWCompat() throws IOException { + StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); + LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30, + reader); + assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" }); + } + + public void testWhitespaceTokenizer() throws IOException { + StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); + WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31, + reader); + assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", + "\ud801\udc1ctest" }); + } + + public void testWhitespaceTokenizerBWCompat() throws IOException { + StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest"); + WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30, + reader); + assertTokenStreamContents(tokenizer, new String[] { "Tokenizer", + "\ud801\udc1ctest" }); + } + + public void testIsTokenCharCharInSubclass() { + new TestingCharTokenizer(Version.LUCENE_30, new StringReader("")); + try { + new TestingCharTokenizer(Version.LUCENE_CURRENT, new StringReader("")); + fail("version 3.1 is not permitted if char based method is implemented"); + } catch (IllegalArgumentException e) { + // expected + } + } + + public void testNormalizeCharInSubclass() { + new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader("")); + try { + new TestingCharTokenizerNormalize(Version.LUCENE_CURRENT, + new StringReader("")); + fail("version 3.1 is not permitted if char based method is implemented"); + } catch (IllegalArgumentException e) { + // expected + } + } + + 
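(Editor's aside, not part of the patch: the guard tests around here all exercise the same constructor-time check. CharTokenizer's useOldAPI(Version), shown earlier in this diff, uses VirtualMethod reflection to reject subclasses that still override the char based methods when a Version >= LUCENE_31 is passed, while subclasses that override only the int based methods are accepted for any version. A minimal sketch:)

    // accepted for any matchVersion: only the new int based API is overridden
    new CharTokenizer(Version.LUCENE_31, new StringReader("")) {
      @Override
      protected boolean isTokenChar(int c) {
        return Character.isLetter(c);
      }
    };

    // throws IllegalArgumentException from the constructor: the old
    // isTokenChar(char) is overridden although matchVersion >= LUCENE_31
    new CharTokenizer(Version.LUCENE_31, new StringReader("")) {
      @Override
      protected boolean isTokenChar(char c) {
        return Character.isLetter(c);
      }
    };
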
public void testNormalizeAndIsTokenCharCharInSubclass() { + new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30, + new StringReader("")); + try { + new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_CURRENT, + new StringReader("")); + fail("version 3.1 is not permitted if char based method is implemented"); + } catch (IllegalArgumentException e) { + // expected + } + } + + static class TestingCharTokenizer extends CharTokenizer { + public TestingCharTokenizer(Version matchVersion, Reader input) { + super(matchVersion, input); + } + + @Override + protected boolean isTokenChar(int c) { + return Character.isLetter(c); + } + + @Override + protected boolean isTokenChar(char c) { + return Character.isLetter(c); + } + } + + static class TestingCharTokenizerNormalize extends CharTokenizer { + public TestingCharTokenizerNormalize(Version matchVersion, Reader input) { + super(matchVersion, input); + } + + @Override + protected char normalize(char c) { + return c; + } + + @Override + protected int normalize(int c) { + return c; + } + } + + static class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer { + public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion, + Reader input) { + super(matchVersion, input); + } + + @Override + protected char normalize(char c) { + return c; + } + + @Override + protected int normalize(int c) { + return c; + } + + @Override + protected boolean isTokenChar(int c) { + return Character.isLetter(c); + } + + @Override + protected boolean isTokenChar(char c) { + return Character.isLetter(c); + } + } +} Property changes on: src\test\org\apache\lucene\analysis\TestCharTokenizers.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: src/test/org/apache/lucene/util/TestCharacterUtils.java =================================================================== --- src/test/org/apache/lucene/util/TestCharacterUtils.java (revision 0) +++ src/test/org/apache/lucene/util/TestCharacterUtils.java (revision 0) @@ -0,0 +1,194 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.util.CharacterUtils.CharacterBuffer; +import org.junit.Test; + +/** + * TestCase for the {@link CharacterUtils} class. 
+ */ +public class TestCharacterUtils { + + @Test + public void testCodePointAtCharArrayInt() { + CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30); + char[] cpAt3 = "Abc\ud801\udc1c".toCharArray(); + char[] highSurrogateAt3 = "Abc\ud801".toCharArray(); + assertEquals((int) 'A', java4.codePointAt(cpAt3, 0)); + assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3)); + assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3)); + try { + java4.codePointAt(highSurrogateAt3, 4); + fail("array index out of bounds"); + } catch (ArrayIndexOutOfBoundsException e) { + } + + CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31); + assertEquals((int) 'A', java5.codePointAt(cpAt3, 0)); + assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt( + cpAt3, 3)); + assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3)); + try { + java5.codePointAt(highSurrogateAt3, 4); + fail("array index out of bounds"); + } catch (ArrayIndexOutOfBoundsException e) { + } + } + + @Test + public void testCodePointAtCharSequenceInt() { + CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30); + String cpAt3 = "Abc\ud801\udc1c"; + String highSurrogateAt3 = "Abc\ud801"; + assertEquals((int) 'A', java4.codePointAt(cpAt3, 0)); + assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3)); + assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3)); + try { + java4.codePointAt(highSurrogateAt3, 4); + fail("string index out of bounds"); + } catch (StringIndexOutOfBoundsException e) { + } + + CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31); + assertEquals((int) 'A', java5.codePointAt(cpAt3, 0)); + assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt( + cpAt3, 3)); + assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3)); + try { + java5.codePointAt(highSurrogateAt3, 4); + fail("string index out of bounds"); + } catch (StringIndexOutOfBoundsException e) { + } + + } + + @Test + public void testCodePointAtCharArrayIntInt() { + CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30); + char[] cpAt3 = "Abc\ud801\udc1c".toCharArray(); + char[] highSurrogateAt3 = "Abc\ud801".toCharArray(); + assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2)); + assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3, 5)); + assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3, 4)); + + CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31); + assertEquals((int) 'A', java5.codePointAt(cpAt3, 0, 2)); + assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt( + cpAt3, 3, 5)); + assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4)); + + } + + @Test + public void testNewCharacterBuffer() { + CharacterBuffer newCharacterBuffer = CharacterUtils.newCharacterBuffer(1024); + assertEquals(1024, newCharacterBuffer.getBuffer().length); + assertEquals(0, newCharacterBuffer.getOffset()); + assertEquals(0, newCharacterBuffer.getLength()); + + newCharacterBuffer = CharacterUtils.newCharacterBuffer(2); + assertEquals(2, newCharacterBuffer.getBuffer().length); + assertEquals(0, newCharacterBuffer.getOffset()); + assertEquals(0, newCharacterBuffer.getLength()); + + try { + newCharacterBuffer = CharacterUtils.newCharacterBuffer(1); + fail("length must be >= 2"); + } catch (IllegalArgumentException e) { + } + } + + @Test + public void testFillNoHighSurrogate() throws IOException { + Version[] versions = new Version[] { Version.LUCENE_30, 
Version.LUCENE_31 }; + for (Version version : versions) { + CharacterUtils instance = CharacterUtils.getInstance(version); + Reader reader = new StringReader("helloworld"); + CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6); + assertTrue(instance.fill(buffer,reader)); + assertEquals(0, buffer.getOffset()); + assertEquals(6, buffer.getLength()); + assertEquals("hellow", new String(buffer.getBuffer())); + assertTrue(instance.fill(buffer,reader)); + assertEquals(4, buffer.getLength()); + assertEquals(0, buffer.getOffset()); + + assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(), + buffer.getLength())); + assertFalse(instance.fill(buffer,reader)); + } + } + + @Test + public void testFillJava15() throws IOException { + String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801"; + CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_31); + Reader reader = new StringReader(input); + CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5); + assertTrue(instance.fill(buffer, reader)); + assertEquals(4, buffer.getLength()); + assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(), + buffer.getLength())); + assertTrue(instance.fill(buffer, reader)); + assertEquals(5, buffer.getLength()); + assertEquals("\ud801\udc1c789", new String(buffer.getBuffer())); + assertTrue(instance.fill(buffer, reader)); + assertEquals(4, buffer.getLength()); + assertEquals("123\ud801", new String(buffer.getBuffer(), + buffer.getOffset(), buffer.getLength())); + assertTrue(instance.fill(buffer, reader)); + assertEquals(2, buffer.getLength()); + assertEquals("\ud801\udc1c", new String(buffer.getBuffer(), buffer + .getOffset(), buffer.getLength())); + assertTrue(instance.fill(buffer, reader)); + assertEquals(1, buffer.getLength()); + assertEquals("\ud801", new String(buffer.getBuffer(), buffer + .getOffset(), buffer.getLength())); + assertFalse(instance.fill(buffer, reader)); + } + + @Test + public void testFillJava14() throws IOException { + String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801"; + CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_30); + Reader reader = new StringReader(input); + CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5); + assertTrue(instance.fill(buffer, reader)); + assertEquals(5, buffer.getLength()); + assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer + .getOffset(), buffer.getLength())); + assertTrue(instance.fill(buffer, reader)); + assertEquals(5, buffer.getLength()); + assertEquals("\udc1c7891", new String(buffer.getBuffer())); + buffer = CharacterUtils.newCharacterBuffer(6); + assertTrue(instance.fill(buffer, reader)); + assertEquals(6, buffer.getLength()); + assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer + .getOffset(), buffer.getLength())); + assertFalse(instance.fill(buffer, reader)); + + } + +} Property changes on: src\test\org\apache\lucene\util\TestCharacterUtils.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native
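
Editor's sketch (illustrative, not part of the patch): end-to-end usage of the new codepoint based API. The subclass name below is hypothetical; the CharTokenizer(Version, Reader) constructor, the isTokenChar(int)/normalize(int) hooks, and TermAttribute all come from the code above. With Version.LUCENE_31 a supplementary character such as U+1041C ("\uD801\uDC1C" in UTF-16) reaches isTokenChar(int) as a single codepoint; with Version.LUCENE_30 the tokenizer falls back to the old per-char path.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

// Hypothetical subclass: overrides only the int based API, so it is legal for
// matchVersion >= LUCENE_31 and sees supplementary characters whole.
public final class LowerCaseLetterOrDigitTokenizer extends CharTokenizer {

  public LowerCaseLetterOrDigitTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
  }

  @Override
  protected boolean isTokenChar(int c) {
    return Character.isLetterOrDigit(c); // codepoint aware since Java 5
  }

  @Override
  protected int normalize(int c) {
    return Character.toLowerCase(c); // lowercases supplementary codepoints too
  }

  public static void main(String[] args) throws IOException {
    // Per the tests above, U+1041C (DESERET CAPITAL LETTER GAY) lowercases
    // to U+10444, i.e. "\uD801\uDC1C" becomes "\uD801\uDC44".
    CharTokenizer tok = new LowerCaseLetterOrDigitTokenizer(Version.LUCENE_31,
        new StringReader("Tokenizer \uD801\uDC1Ctest 42"));
    TermAttribute term = tok.addAttribute(TermAttribute.class);
    while (tok.incrementToken()) {
      System.out.println(term.term()); // tokenizer, \uD801\uDC44test, 42
    }
    tok.end();
    tok.close();
  }
}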
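
A second sketch (again illustrative) of the CharacterUtils.fill plumbing that backs the tokenizer. With a Version >= 3.1 instance, fill never leaves an unpaired high surrogate at the end of the buffer (it is held back and carried into the next call), so Character.codePointAt on the buffer always sees complete pairs. Note that, as the tests above show, fill returns true while it was able to read characters and false once the reader is exhausted.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.util.CharacterUtils;
import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.util.Version;

public class CharacterUtilsFillDemo {
  public static void main(String[] args) throws IOException {
    // LUCENE_31 selects the supplementary-aware implementation; LUCENE_30
    // would fall back to plain per-char Reader.read semantics.
    CharacterUtils utils = CharacterUtils.getInstance(Version.LUCENE_31);
    CharacterBuffer buf = CharacterUtils.newCharacterBuffer(5); // must be >= 2
    Reader reader = new StringReader("1234\uD801\uDC1C789");
    while (utils.fill(buf, reader)) { // true while characters were read
      // No chunk ends in an unpaired high surrogate: first "1234" (the high
      // surrogate is held back), then "\uD801\uDC1C789" on the next fill.
      System.out.println(new String(buf.getBuffer(), buf.getOffset(),
          buf.getLength()));
    }
  }
}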