Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 904100)
+++ CHANGES.txt (working copy)
@@ -127,6 +127,11 @@
 * LUCENE-2198: Support protected words in stemming TokenFilters using a new
   KeywordAttribute. (Simon Willnauer via Uwe Schindler)
+
+* LUCENE-2183: Added Unicode 4 support to CharTokenizer and its subclasses.
+  CharTokenizer now has a new int-API which is conditionally preferred to
+  the old char-API depending on the provided Version. Version < 3.1 will
+  use the char-API. (Simon Willnauer via Uwe Schindler)
 
 Optimizations
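For illustration, a minimal sketch of the Version switch described in this entry (not part of the patch: the demo class name is hypothetical; the expected tokens are the ones asserted in TestCharTokenizers further below):

    import java.io.StringReader;
    import org.apache.lucene.analysis.LowerCaseTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;
    import org.apache.lucene.util.Version;

    public class VersionedTokenizerDemo {
      public static void main(String[] args) throws Exception {
        // matchVersion >= 3.1 selects the new int-API: the supplementary
        // codepoint U+1041C ("\ud801\udc1c") is a letter and is lowercased
        // to U+10444 ("\ud801\udc44").
        LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_31,
            new StringReader("Tokenizer \ud801\udc1ctest"));
        TermAttribute term = tokenizer.addAttribute(TermAttribute.class);
        while (tokenizer.incrementToken()) {
          System.out.println(term.term()); // "tokenizer", then "\ud801\udc44test"
        }
        // With Version.LUCENE_30 the old char-API is selected instead: the two
        // surrogate chars are not letters on their own and are dropped, so the
        // same input yields "tokenizer", then "test".
      }
    }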
Index: contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java
===================================================================
--- contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (revision 904100)
+++ contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (working copy)
@@ -18,8 +18,10 @@
 
 import java.io.Reader;
 
+import org.apache.lucene.analysis.CharTokenizer;
 import org.apache.lucene.analysis.LetterTokenizer;
 import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 
 /**
  * Tokenizer that breaks text into runs of letters and diacritics.
@@ -27,28 +29,101 @@
  * The problem with the standard Letter tokenizer is that it fails on diacritics.
  * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
  *
- *
+ *
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link ArabicLetterTokenizer}:
+ *

+ *
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link RussianLetterTokenizer}:
+ *

+ * You must specify the required {@link Version} compatibility when creating
+ * {@link CharTokenizer}:
+ *

+ * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+ * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+ * compatible int based API to support codepoints instead of chars. Subclasses
+ * of {@link CharTokenizer} must not override the char based methods if a
+ * {@link Version} >= 3.1 is passed to the constructor.
+ *
+ * NOTE: This method will be marked abstract in Lucene 4.0.
+ *
+ */
+  protected boolean isTokenChar(int c) {
+    throw new UnsupportedOperationException("since LUCENE_3_1 subclasses of CharTokenizer must implement isTokenChar(int)");
+  }

-  /** Called on each token character to normalize it before it is added to the
-   * token. The default implementation does nothing. Subclasses may use this
-   * to, e.g., lowercase tokens. */
-  protected char normalize(char c) {
+  /**
+   * Called on each token character to normalize it before it is added to the
+   * token. The default implementation does nothing. Subclasses may use this to,
+   * e.g., lowercase tokens.
+   *
+   * As of Lucene 3.1 the char based API ({@link #isTokenChar(char)} and
+   * {@link #normalize(char)}) has been deprecated in favor of a Unicode 4.0
+   * compatible int based API to support codepoints instead of chars. Subclasses
+   * of {@link CharTokenizer} must not override the char based methods if a
+   * {@link Version} >= 3.1 is passed to the constructor.
+   *
+   * NOTE: This method will be marked abstract in Lucene 4.0.
+   */
+  protected int normalize(int c) {
     return c;
   }
 
   @Override
   public final boolean incrementToken() throws IOException {
     clearAttributes();
+    if (useOldAPI) // TODO remove this in LUCENE 4.0
+      return incrementTokenOld();
     int length = 0;
     int start = bufferIndex;
     char[] buffer = termAtt.termBuffer();
     while (true) {
+      if (bufferIndex >= dataLen) {
+        offset += dataLen;
+        if (!charUtils.fill(ioBuffer, input)) {
+          dataLen = 0; // so next offset += dataLen won't decrement offset
+          if (length > 0)
+            break;
+          else
+            return false;
+        }
+        dataLen = ioBuffer.getLength();
+        bufferIndex = 0;
+      }
+      final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex);
+      bufferIndex += Character.charCount(c);
+
+      if (isTokenChar(c)) {                   // if it's a token char
+        if (length == 0)                      // start of token
+          start = offset + bufferIndex - 1;
+        else if (length >= buffer.length - 1) // check if a supplementary could run out of bounds
+          buffer = termAtt.resizeTermBuffer(2 + length); // make sure a supplementary fits in the buffer
+        length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized
+        if (length >= MAX_WORD_LEN)           // buffer overflow! use >= since a surrogate pair could skip past an == test
+          break;
+      } else if (length > 0)                  // at non-Letter w/ chars
+        break;                                // return 'em
+    }
+
+    termAtt.setTermLength(length);
+    offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
+    return true;
+
+  }
+
+  /**
+   * The <= 3.0 version of incrementToken. This is a backwards compatible
+   * implementation used if a version <= 3.0 is provided to the ctor.
+   * @deprecated remove in 4.0
+   */
+  @Deprecated
+  private boolean incrementTokenOld() throws IOException {
+    int length = 0;
+    int start = bufferIndex;
+    char[] buffer = termAtt.termBuffer();
+    final char[] oldIoBuffer = ioBuffer.getBuffer();
+    while (true) {
+
     if (bufferIndex >= dataLen) {
       offset += dataLen;
-      dataLen = input.read(ioBuffer);
+      dataLen = input.read(oldIoBuffer);
       if (dataLen == -1) {
         dataLen = 0; // so next offset += dataLen won't decrement offset
         if (length > 0)
@@ -86,7 +289,7 @@
       bufferIndex = 0;
     }

-    final char c = ioBuffer[bufferIndex++];
+    final char c = oldIoBuffer[bufferIndex++];

     if (isTokenChar(c)) {               // if it's a token char
@@ -107,8 +310,10 @@
     termAtt.setTermLength(length);
     offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
     return true;
-  }
+  }
+
+
   @Override
   public final void end() {
     // set final offset
@@ -122,5 +327,15 @@
     bufferIndex = 0;
     offset = 0;
     dataLen = 0;
+    ioBuffer.reset();
   }
-}
+
+  private boolean useOldAPI(Version matchVersion) {
+    final Class<? extends CharTokenizer> clazz = this.getClass();
+    if (matchVersion.onOrAfter(Version.LUCENE_31)
+        && (isTokenCharMethod.isOverriddenAsOf(clazz) || normalizeMethod
+            .isOverriddenAsOf(clazz))) throw new IllegalArgumentException(
+        "For matchVersion >= LUCENE_31, CharTokenizer subclasses must not override isTokenChar(char) or normalize(char).");
+    return !matchVersion.onOrAfter(Version.LUCENE_31);
+  }
+}
\ No newline at end of file
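A subclass written against the new API overrides only the int based methods; a minimal sketch in the spirit of the TestingCharTokenizer used in the tests further below (the DigitTokenizer name is illustrative, not part of the patch):

    import java.io.Reader;
    import org.apache.lucene.analysis.CharTokenizer;
    import org.apache.lucene.util.Version;

    // With matchVersion >= LUCENE_31 the ctor rejects subclasses that still
    // override isTokenChar(char) or normalize(char); see useOldAPI above.
    final class DigitTokenizer extends CharTokenizer {
      DigitTokenizer(Version matchVersion, Reader input) {
        super(matchVersion, input);
      }

      @Override
      protected boolean isTokenChar(int c) { // receives a codepoint, not a char
        return Character.isDigit(c);
      }

      @Override
      protected int normalize(int c) {       // identity normalization
        return c;
      }
    }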
Index: src/java/org/apache/lucene/analysis/LetterTokenizer.java
===================================================================
--- src/java/org/apache/lucene/analysis/LetterTokenizer.java (revision 904100)
+++ src/java/org/apache/lucene/analysis/LetterTokenizer.java (working copy)
@@ -20,34 +20,106 @@
 
 import java.io.Reader;
 
 import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
 
-/** A LetterTokenizer is a tokenizer that divides text at non-letters. That's
-   to say, it defines tokens as maximal strings of adjacent letters, as defined
-   by java.lang.Character.isLetter() predicate.
+/**
+ * A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
+ * say, it defines tokens as maximal strings of adjacent letters, as defined by
+ * java.lang.Character.isLetter() predicate.
+ *
+ * Note: this does a decent job for most European languages, but does a terrible
+ * job for some Asian languages, where words are not separated by spaces.
+ *
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LetterTokenizer}:
+ *

 * Note: this does a decent job for most European languages, but does a terrible
 * job for some Asian languages, where words are not separated by spaces.
+ *
+ * You must specify the required {@link Version} compatibility when creating
+ * {@link LowerCaseTokenizer}:
+ *

+ * You must specify the required {@link Version} compatibility when creating
+ * {@link WhitespaceTokenizer}:
+ *

+ /**
+ * Creates a new {@link CharacterBuffer} and allocates a char[]
+ * of the given bufferSize.
+ *
+ * @param bufferSize
+ * the internal char buffer size, must be >= 2
+ * @return a new {@link CharacterBuffer} instance.
+ */
+ public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
+ if(bufferSize < 2)
+ throw new IllegalArgumentException("buffersize must be >= 2");
+ return new CharacterBuffer(new char[bufferSize], 0, 0);
+ }
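The lower bound of 2 is what guarantees that a complete surrogate pair always fits into a freshly allocated buffer. A short usage sketch (the values mirror testNewCharacterBuffer further below):

    CharacterBuffer buf = CharacterUtils.newCharacterBuffer(1024);
    // buf.getBuffer().length == 1024, buf.getOffset() == 0, buf.getLength() == 0
    CharacterUtils.newCharacterBuffer(1); // throws IllegalArgumentException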
+ /**
+ * Fills the {@link CharacterBuffer} with characters read from the given
+ * {@link Reader}. This method tries to read as many characters into the
+ * {@link CharacterBuffer} as possible; each call to fill will start
+ * filling the buffer from offset 0 up to the length of the
+ * internal character array.
+ *
+ * Depending on the {@link Version} passed to
+ * {@link CharacterUtils#getInstance(Version)} this method implements
+ * supplementary character awareness when filling the given buffer. For all
+ * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
+ * that the given {@link CharacterBuffer} will never contain a high surrogate
+ * character as the last element in the buffer unless it is the last available
+ * character in the reader. In other words, high and low surrogate pairs will
+ * always be preserved across buffer borders.
+ *
+ * @param buffer
+ *          the buffer to fill.
+ * @param reader
+ *          the reader to read characters from.
+ * @return false if and only if no more characters are available
+ *         in the reader, otherwise true.
+ * @throws IOException
+ * if the reader throws an {@link IOException}.
+ */
+ public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;
+
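The intended consumption pattern, sketched from this contract and from its use in CharTokenizer.incrementToken above (the input Reader and the enclosing method are assumed to be in scope; this is illustrative, not part of the patch):

    CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_31);
    CharacterUtils.CharacterBuffer buf = CharacterUtils.newCharacterBuffer(1024);
    while (charUtils.fill(buf, input)) {
      final char[] chars = buf.getBuffer();
      final int end = buf.getOffset() + buf.getLength();
      int pos = buf.getOffset();
      while (pos < end) {
        // safe: fill() never leaves a split surrogate pair at the buffer end
        final int cp = Character.codePointAt(chars, pos);
        pos += Character.charCount(cp);
        // ... process the codepoint cp ...
      }
    }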
private static final class Java5CharacterUtils extends CharacterUtils {
Java5CharacterUtils() {
- };
+ }
@Override
public final int codePointAt(final char[] chars, final int offset) {
@@ -124,12 +169,32 @@
return Character.codePointAt(chars, offset, limit);
}
-
+ @Override
+ public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
+ final char[] charBuffer = buffer.buffer;
+ buffer.offset = 0;
+ // a high surrogate held back by the previous fill is moved to the front
+ // of the buffer; the new read then starts behind it
+ charBuffer[0] = buffer.lastTrailingHighSurrogate;
+ final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1;
+ buffer.lastTrailingHighSurrogate = 0;
+ final int read = reader.read(charBuffer, offset, charBuffer.length - offset);
+ if (read == -1) {
+ buffer.length = offset;
+ return offset != 0;
+ }
+ buffer.length = read + offset;
+ // if the buffer ends with a high surrogate, hold it back for the next
+ // fill so surrogate pairs are never split across buffer borders; the
+ // length > 1 guard keeps a lone trailing surrogate (the last available
+ // char) in the buffer instead of withholding it forever
+ if (buffer.length > 1
+     && Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+   buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+ }
+ return true;
+ }
}
private static final class Java4CharacterUtils extends CharacterUtils {
Java4CharacterUtils() {
- };
+ }
@Override
public final int codePointAt(final char[] chars, final int offset) {
@@ -148,6 +213,72 @@
return chars[offset];
}
+ @Override
+ public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
+ buffer.offset = 0;
+ final int read = reader.read(buffer.buffer);
+ if(read == -1)
+ return false;
+ buffer.length = read;
+ return true;
+ }
+
}
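For the same input the two implementations differ only at surrogate boundaries; a compact comparison distilled from testFillJava14 and testFillJava15 further below:

    // input "1234\ud801\udc1c789...", buffer size 5:
    // Java4CharacterUtils (Version <= 3.0) fills "1234\ud801", then "\udc1c7891", ...
    //   -> a surrogate pair may be split across buffer borders.
    // Java5CharacterUtils (Version >= 3.1) fills "1234", then "\ud801\udc1c789", ...
    //   -> the trailing high surrogate is held back and prepended to the next fill.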
+
+ /**
+ * A simple IO buffer to use with
+ * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
+ */
+ public static final class CharacterBuffer {
+
+ private final char[] buffer;
+ private int offset;
+ private int length;
+ private char lastTrailingHighSurrogate = 0;
+
+ CharacterBuffer(char[] buffer, int offset, int length) {
+ this.buffer = buffer;
+ this.offset = offset;
+ this.length = length;
+ }
+
+ /**
+ * Returns the internal buffer
+ *
+ * @return the buffer
+ */
+ public char[] getBuffer() {
+ return buffer;
+ }
+
+ /**
+ * Returns the data offset in the internal buffer.
+ *
+ * @return the offset
+ */
+ public int getOffset() {
+ return offset;
+ }
+
+ /**
+ * Returns the length of the data in the internal buffer starting at
+ * {@link #getOffset()}.
+ *
+ * @return the length
+ */
+ public int getLength() {
+ return length;
+ }
+
+ /**
+ * Resets the CharacterBuffer. All internals are reset to their default
+ * values.
+ */
+ public void reset() {
+ offset = 0;
+ length = 0;
+ lastTrailingHighSurrogate = 0;
+ }
+ }
}
Index: src/test/org/apache/lucene/analysis/TestCharTokenizers.java
===================================================================
--- src/test/org/apache/lucene/analysis/TestCharTokenizers.java (revision 0)
+++ src/test/org/apache/lucene/analysis/TestCharTokenizers.java (revision 0)
@@ -0,0 +1,223 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Random;
+
+import org.apache.lucene.util.Version;
+
+/**
+ * Testcase for {@link CharTokenizer} subclasses
+ */
+public class TestCharTokenizers extends BaseTokenStreamTestCase {
+
+ /*
+ * test to read surrogate pairs without losing the pairing
+ * if the surrogate pair is at the border of the internal IO buffer
+ */
+ public void testReadSupplementaryChars() throws IOException {
+ StringBuilder builder = new StringBuilder();
+ Random newRandom = newRandom();
+ // create random input
+ int num = 1024 + newRandom.nextInt(1024);
+ for (int i = 1; i < num; i++) {
+ builder.append("\ud801\udc1cabc");
+ if((i % 10) == 0)
+ builder.append(" ");
+ }
+ // internal buffer size is 1024; make sure we have a surrogate pair right at the border
+ builder.insert(1023, "\ud801\udc1c");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
+ Version.LUCENE_CURRENT, new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" "));
+ }
+
+ /*
+ * test extending the TermAttribute's internal char buffer. If the internal
+ * algorithm that grows the char array extended it by only 1 char and the
+ * next char to be filled in were a supplementary codepoint (using 2 chars),
+ * an index out of bounds exception would be triggered.
+ */
+ public void testExtendCharBuffer() throws IOException {
+ for (int i = 0; i < 40; i++) {
+ StringBuilder builder = new StringBuilder();
+ for (int j = 0; j < 1+i; j++) {
+ builder.append("a");
+ }
+ builder.append("\ud801\udc1cabc");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
+ Version.LUCENE_CURRENT, new StringReader(builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()});
+ }
+ }
+
+ /*
+ * tests the max word length of 255 - the tokenizer will split at the 255th char no matter what happens
+ */
+ public void testMaxWordLength() throws IOException {
+ StringBuilder builder = new StringBuilder();
+
+ for (int i = 0; i < 255; i++) {
+ builder.append("A");
+ }
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
+ Version.LUCENE_CURRENT, new StringReader(builder.toString() + builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
+ }
+
+ /*
+ * tests the max word length of 255 with a surrogate pair at position 255
+ */
+ public void testMaxWordLengthWithSupplementary() throws IOException {
+ StringBuilder builder = new StringBuilder();
+
+ for (int i = 0; i < 254; i++) {
+ builder.append("A");
+ }
+ builder.append("\ud801\udc1c");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(
+ Version.LUCENE_CURRENT, new StringReader(builder.toString() + builder.toString()));
+ assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()});
+ }
+
+ public void testLowerCaseTokenizer() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_31,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "tokenizer",
+ "\ud801\udc44test" });
+ }
+
+ public void testLowerCaseTokenizerBWCompat() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" });
+ }
+
+ public void testWhitespaceTokenizer() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
+ "\ud801\udc1ctest" });
+ }
+
+ public void testWhitespaceTokenizerBWCompat() throws IOException {
+ StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
+ WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30,
+ reader);
+ assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
+ "\ud801\udc1ctest" });
+ }
+
+ public void testIsTokenCharCharInSubclass() {
+ new TestingCharTokenizer(Version.LUCENE_30, new StringReader(""));
+ try {
+ new TestingCharTokenizer(Version.LUCENE_CURRENT, new StringReader(""));
+ fail("version 3.1 is not permitted if char based method is implemented");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ }
+
+ public void testNormalizeCharInSubclass() {
+ new TestingCharTokenizerNormalize(Version.LUCENE_30, new StringReader(""));
+ try {
+ new TestingCharTokenizerNormalize(Version.LUCENE_CURRENT,
+ new StringReader(""));
+ fail("version 3.1 is not permitted if char based method is implemented");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ }
+
+ public void testNormalizeAndIsTokenCharCharInSubclass() {
+ new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_30,
+ new StringReader(""));
+ try {
+ new TestingCharTokenizerNormalizeIsTokenChar(Version.LUCENE_CURRENT,
+ new StringReader(""));
+ fail("version 3.1 is not permitted if char based method is implemented");
+ } catch (IllegalArgumentException e) {
+ // expected
+ }
+ }
+
+ static class TestingCharTokenizer extends CharTokenizer {
+ public TestingCharTokenizer(Version matchVersion, Reader input) {
+ super(matchVersion, input);
+ }
+
+ @Override
+ protected boolean isTokenChar(int c) {
+ return Character.isLetter(c);
+ }
+
+ @Override
+ protected boolean isTokenChar(char c) {
+ return Character.isLetter(c);
+ }
+ }
+
+ static class TestingCharTokenizerNormalize extends CharTokenizer {
+ public TestingCharTokenizerNormalize(Version matchVersion, Reader input) {
+ super(matchVersion, input);
+ }
+
+ @Override
+ protected char normalize(char c) {
+ return c;
+ }
+
+ @Override
+ protected int normalize(int c) {
+ return c;
+ }
+ }
+
+ static class TestingCharTokenizerNormalizeIsTokenChar extends CharTokenizer {
+ public TestingCharTokenizerNormalizeIsTokenChar(Version matchVersion,
+ Reader input) {
+ super(matchVersion, input);
+ }
+
+ @Override
+ protected char normalize(char c) {
+ return c;
+ }
+
+ @Override
+ protected int normalize(int c) {
+ return c;
+ }
+
+ @Override
+ protected boolean isTokenChar(int c) {
+ return Character.isLetter(c);
+ }
+
+ @Override
+ protected boolean isTokenChar(char c) {
+ return Character.isLetter(c);
+ }
+ }
+}
Property changes on: src\test\org\apache\lucene\analysis\TestCharTokenizers.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native
Index: src/test/org/apache/lucene/util/TestCharacterUtils.java
===================================================================
--- src/test/org/apache/lucene/util/TestCharacterUtils.java (revision 0)
+++ src/test/org/apache/lucene/util/TestCharacterUtils.java (revision 0)
@@ -0,0 +1,194 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.util.CharacterUtils.CharacterBuffer;
+import org.junit.Test;
+
+/**
+ * TestCase for the {@link CharacterUtils} class.
+ */
+public class TestCharacterUtils {
+
+ @Test
+ public void testCodePointAtCharArrayInt() {
+ CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
+ char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
+ char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
+ assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
+ assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
+ assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
+ try {
+ java4.codePointAt(highSurrogateAt3, 4);
+ fail("array index out of bounds");
+ } catch (ArrayIndexOutOfBoundsException e) {
+ }
+
+ CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31);
+ assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
+ assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
+ cpAt3, 3));
+ assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
+ try {
+ java5.codePointAt(highSurrogateAt3, 4);
+ fail("array index out of bounds");
+ } catch (ArrayIndexOutOfBoundsException e) {
+ }
+ }
+
+ @Test
+ public void testCodePointAtCharSequenceInt() {
+ CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
+ String cpAt3 = "Abc\ud801\udc1c";
+ String highSurrogateAt3 = "Abc\ud801";
+ assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
+ assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
+ assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
+ try {
+ java4.codePointAt(highSurrogateAt3, 4);
+ fail("string index out of bounds");
+ } catch (StringIndexOutOfBoundsException e) {
+ }
+
+ CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31);
+ assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
+ assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
+ cpAt3, 3));
+ assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
+ try {
+ java5.codePointAt(highSurrogateAt3, 4);
+ fail("string index out of bounds");
+ } catch (StringIndexOutOfBoundsException e) {
+ }
+
+ }
+
+ @Test
+ public void testCodePointAtCharArrayIntInt() {
+ CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
+ char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
+ char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
+ assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));
+ assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3, 5));
+ assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3, 4));
+
+ CharacterUtils java5 = CharacterUtils.getInstance(Version.LUCENE_31);
+ assertEquals((int) 'A', java5.codePointAt(cpAt3, 0, 2));
+ assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
+ cpAt3, 3, 5));
+ assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
+
+ }
+
+ @Test
+ public void testNewCharacterBuffer() {
+ CharacterBuffer newCharacterBuffer = CharacterUtils.newCharacterBuffer(1024);
+ assertEquals(1024, newCharacterBuffer.getBuffer().length);
+ assertEquals(0, newCharacterBuffer.getOffset());
+ assertEquals(0, newCharacterBuffer.getLength());
+
+ newCharacterBuffer = CharacterUtils.newCharacterBuffer(2);
+ assertEquals(2, newCharacterBuffer.getBuffer().length);
+ assertEquals(0, newCharacterBuffer.getOffset());
+ assertEquals(0, newCharacterBuffer.getLength());
+
+ try {
+ newCharacterBuffer = CharacterUtils.newCharacterBuffer(1);
+ fail("length must be >= 2");
+ } catch (IllegalArgumentException e) {
+ }
+ }
+
+ @Test
+ public void testFillNoHighSurrogate() throws IOException {
+ Version[] versions = new Version[] { Version.LUCENE_30, Version.LUCENE_31 };
+ for (Version version : versions) {
+ CharacterUtils instance = CharacterUtils.getInstance(version);
+ Reader reader = new StringReader("helloworld");
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
+ assertTrue(instance.fill(buffer,reader));
+ assertEquals(0, buffer.getOffset());
+ assertEquals(6, buffer.getLength());
+ assertEquals("hellow", new String(buffer.getBuffer()));
+ assertTrue(instance.fill(buffer,reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals(0, buffer.getOffset());
+
+ assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertFalse(instance.fill(buffer,reader));
+ }
+ }
+
+ @Test
+ public void testFillJava15() throws IOException {
+ String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
+ CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_31);
+ Reader reader = new StringReader(input);
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
+ buffer.getLength()));
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(5, buffer.getLength());
+ assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(4, buffer.getLength());
+ assertEquals("123\ud801", new String(buffer.getBuffer(),
+ buffer.getOffset(), buffer.getLength()));
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(2, buffer.getLength());
+ assertEquals("\ud801\udc1c", new String(buffer.getBuffer(), buffer
+ .getOffset(), buffer.getLength()));
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(1, buffer.getLength());
+ assertEquals("\ud801", new String(buffer.getBuffer(), buffer
+ .getOffset(), buffer.getLength()));
+ assertFalse(instance.fill(buffer, reader));
+ }
+
+ @Test
+ public void testFillJava14() throws IOException {
+ String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
+ CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_30);
+ Reader reader = new StringReader(input);
+ CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(5, buffer.getLength());
+ assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer
+ .getOffset(), buffer.getLength()));
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(5, buffer.getLength());
+ assertEquals("\udc1c7891", new String(buffer.getBuffer()));
+ buffer = CharacterUtils.newCharacterBuffer(6);
+ assertTrue(instance.fill(buffer, reader));
+ assertEquals(6, buffer.getLength());
+ assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
+ .getOffset(), buffer.getLength()));
+ assertFalse(instance.fill(buffer, reader));
+
+ }
+
+}
Property changes on: src\test\org\apache\lucene\util\TestCharacterUtils.java
___________________________________________________________________
Added: svn:keywords
+ Date Author Id Revision HeadURL
Added: svn:eol-style
+ native