Index: modules/analysis/CHANGES.txt =================================================================== --- modules/analysis/CHANGES.txt (revision 1098593) +++ modules/analysis/CHANGES.txt (working copy) @@ -83,6 +83,8 @@ - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader + - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer + - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils * SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of /something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi) Index: modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java (revision 1098593) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharTokenizers.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,7 +20,11 @@ import java.io.IOException; import java.io.StringReader; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseTokenizer; + /** * Testcase for {@link CharTokenizer} subclasses */ @@ -42,7 +46,7 @@ } // internal buffer size is 1024 make sure we have a surrogate pair right at the border builder.insert(1023, "\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString())); assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); } @@ -59,7 +63,7 @@ builder.append("a"); } builder.append("\ud801\udc1cabc"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString())); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); } } @@ -73,7 +77,7 @@ for (int i = 0; i < 255; i++) { builder.append("A"); } - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); } @@ -87,7 +91,7 @@ builder.append("A"); } builder.append("\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); + Tokenizer tokenizer = new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(builder.toString() + builder.toString())); assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); } } Index: modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java (revision 1098593) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.util; +package org.apache.lucene.analysis.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,7 +21,9 @@ import java.io.Reader; import java.io.StringReader; -import org.apache.lucene.util.CharacterUtils.CharacterBuffer; +import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; import org.junit.Test; /** Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseTokenizer.java (working copy) @@ -19,8 +19,8 @@ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java (working copy) @@ -20,7 +20,7 @@ import java.io.Reader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceAnalyzer.java (working copy) @@ -19,7 +19,7 @@ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.util.ReusableAnalyzerBase; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/WhitespaceTokenizer.java (working copy) @@ -19,8 +19,8 @@ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/LetterTokenizer.java (working copy) @@ -19,8 +19,8 @@ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java (working copy) @@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; /** Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java (working copy) @@ -18,8 +18,8 @@ */ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.Tokenizer; // for javadocs +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs import org.apache.lucene.util.AttributeSource; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java (working copy) @@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; /** Index: modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java (working copy) @@ -18,8 +18,8 @@ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/in/IndicTokenizer.java (working copy) @@ -19,7 +19,7 @@ import java.io.Reader; -import org.apache.lucene.analysis.CharTokenizer; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java (working copy) @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis; +package org.apache.lucene.analysis.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,12 +20,13 @@ import java.io.IOException; import java.io.Reader; +import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; -import org.apache.lucene.util.CharacterUtils.CharacterBuffer; +import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer; /** * An abstract base class for simple, character-oriented tokenizers. Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java (working copy) @@ -24,7 +24,7 @@ import java.util.Map; import java.util.Set; -import org.apache.lucene.util.CharacterUtils; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.util.Version; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (revision 1098593) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java (working copy) @@ -1,8 +1,10 @@ -package org.apache.lucene.util; +package org.apache.lucene.analysis.util; import java.io.IOException; import java.io.Reader; +import org.apache.lucene.util.Version; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with Index: lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java =================================================================== --- lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java (revision 1098593) +++ lucene/src/test/org/apache/lucene/analysis/TestCharTokenizers.java (working copy) @@ -1,93 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.StringReader; - - -/** - * Testcase for {@link CharTokenizer} subclasses - */ -public class TestCharTokenizers extends BaseTokenStreamTestCase { - - /* - * test to read surrogate pairs without loosing the pairing - * if the surrogate pair is at the border of the internal IO buffer - */ - public void testReadSupplementaryChars() throws IOException { - StringBuilder builder = new StringBuilder(); - // create random input - int num = 1024 + random.nextInt(1024); - num *= RANDOM_MULTIPLIER; - for (int i = 1; i < num; i++) { - builder.append("\ud801\udc1cabc"); - if((i % 10) == 0) - builder.append(" "); - } - // internal buffer size is 1024 make sure we have a surrogate pair right at the border - builder.insert(1023, "\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); - assertTokenStreamContents(tokenizer, builder.toString().toLowerCase().split(" ")); - } - - /* - * test to extend the buffer TermAttribute buffer internally. If the internal - * alg that extends the size of the char array only extends by 1 char and the - * next char to be filled in is a supplementary codepoint (using 2 chars) an - * index out of bound exception is triggered. - */ - public void testExtendCharBuffer() throws IOException { - for (int i = 0; i < 40; i++) { - StringBuilder builder = new StringBuilder(); - for (int j = 0; j < 1+i; j++) { - builder.append("a"); - } - builder.append("\ud801\udc1cabc"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString()), MockTokenizer.SIMPLE, true); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase()}); - } - } - - /* - * tests the max word length of 255 - tokenizer will split at the 255 char no matter what happens - */ - public void testMaxWordLength() throws IOException { - StringBuilder builder = new StringBuilder(); - - for (int i = 0; i < 255; i++) { - builder.append("A"); - } - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); - } - - /* - * tests the max word length of 255 with a surrogate pair at position 255 - */ - public void testMaxWordLengthWithSupplementary() throws IOException { - StringBuilder builder = new StringBuilder(); - - for (int i = 0; i < 254; i++) { - builder.append("A"); - } - builder.append("\ud801\udc1c"); - MockTokenizer tokenizer = new MockTokenizer(new StringReader(builder.toString() + builder.toString()), MockTokenizer.SIMPLE, true); - assertTokenStreamContents(tokenizer, new String[] {builder.toString().toLowerCase(), builder.toString().toLowerCase()}); - } -} Index: lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java =================================================================== --- lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 1098593) +++ lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy) @@ -616,7 +616,7 @@ } for(int i=start;i= 2"); - } catch (IllegalArgumentException e) { - } - } - - @Test - public void testFillNoHighSurrogate() throws IOException { - Version[] versions = new Version[] { Version.LUCENE_30, TEST_VERSION_CURRENT }; - for (Version version : versions) { - CharacterUtils instance = CharacterUtils.getInstance(version); - Reader reader = new StringReader("helloworld"); - CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6); - assertTrue(instance.fill(buffer,reader)); - assertEquals(0, buffer.getOffset()); - assertEquals(6, buffer.getLength()); - assertEquals("hellow", new String(buffer.getBuffer())); - assertTrue(instance.fill(buffer,reader)); - assertEquals(4, buffer.getLength()); - assertEquals(0, buffer.getOffset()); - - assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(), - buffer.getLength())); - assertFalse(instance.fill(buffer,reader)); - } - } - - @Test - public void testFillJava15() throws IOException { - String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801"; - CharacterUtils instance = CharacterUtils.getInstance(TEST_VERSION_CURRENT); - Reader reader = new StringReader(input); - CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5); - assertTrue(instance.fill(buffer, reader)); - assertEquals(4, buffer.getLength()); - assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(), - buffer.getLength())); - assertTrue(instance.fill(buffer, reader)); - assertEquals(5, buffer.getLength()); - assertEquals("\ud801\udc1c789", new String(buffer.getBuffer())); - assertTrue(instance.fill(buffer, reader)); - assertEquals(4, buffer.getLength()); - assertEquals("123\ud801", new String(buffer.getBuffer(), - buffer.getOffset(), buffer.getLength())); - assertTrue(instance.fill(buffer, reader)); - assertEquals(2, buffer.getLength()); - assertEquals("\ud801\udc1c", new String(buffer.getBuffer(), buffer - .getOffset(), buffer.getLength())); - assertTrue(instance.fill(buffer, reader)); - assertEquals(1, buffer.getLength()); - assertEquals("\ud801", new String(buffer.getBuffer(), buffer - .getOffset(), buffer.getLength())); - assertFalse(instance.fill(buffer, reader)); - } - - @Test - public void testFillJava14() throws IOException { - String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801"; - CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_30); - Reader reader = new StringReader(input); - CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5); - assertTrue(instance.fill(buffer, reader)); - assertEquals(5, buffer.getLength()); - assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer - .getOffset(), buffer.getLength())); - assertTrue(instance.fill(buffer, reader)); - assertEquals(5, buffer.getLength()); - assertEquals("\udc1c7891", new String(buffer.getBuffer())); - buffer = CharacterUtils.newCharacterBuffer(6); - assertTrue(instance.fill(buffer, reader)); - assertEquals(6, buffer.getLength()); - assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer - .getOffset(), buffer.getLength())); - assertFalse(instance.fill(buffer, reader)); - - } - -} Index: lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java (revision 1098593) +++ lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java (working copy) @@ -1,202 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.Reader; - -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.CharacterUtils; -import org.apache.lucene.util.Version; -import org.apache.lucene.util.CharacterUtils.CharacterBuffer; - -/** - * An abstract base class for simple, character-oriented tokenizers. - *

- * You must specify the required {@link Version} compatibility - * when creating {@link CharTokenizer}: - *

- *

- * A new {@link CharTokenizer} API has been introduced with Lucene 3.1. This API - * moved from UTF-16 code units to UTF-32 codepoints to eventually add support - * for supplementary characters. The old char based API has been - * deprecated and should be replaced with the int based methods - * {@link #isTokenChar(int)} and {@link #normalize(int)}. - *

- *

- * As of Lucene 3.1 each {@link CharTokenizer} - constructor expects a - * {@link Version} argument. Based on the given {@link Version} either the new - * API or a backwards compatibility layer is used at runtime. For - * {@link Version} < 3.1 the backwards compatibility layer ensures correct - * behavior even for indexes build with previous versions of Lucene. If a - * {@link Version} >= 3.1 is used {@link CharTokenizer} requires the new API to - * be implemented by the instantiated class. Yet, the old char based API - * is not required anymore even if backwards compatibility must be preserved. - * {@link CharTokenizer} subclasses implementing the new API are fully backwards - * compatible if instantiated with {@link Version} < 3.1. - *

- *

- * Note: If you use a subclass of {@link CharTokenizer} with {@link Version} >= - * 3.1 on an index build with a version < 3.1, created tokens might not be - * compatible with the terms in your index. - *

- **/ -public abstract class CharTokenizer extends Tokenizer { - - /** - * Creates a new {@link CharTokenizer} instance - * - * @param matchVersion - * Lucene version to match See {@link above} - * @param input - * the input to split up into tokens - */ - public CharTokenizer(Version matchVersion, Reader input) { - super(input); - charUtils = CharacterUtils.getInstance(matchVersion); - } - - /** - * Creates a new {@link CharTokenizer} instance - * - * @param matchVersion - * Lucene version to match See {@link above} - * @param source - * the attribute source to use for this {@link Tokenizer} - * @param input - * the input to split up into tokens - */ - public CharTokenizer(Version matchVersion, AttributeSource source, - Reader input) { - super(source, input); - charUtils = CharacterUtils.getInstance(matchVersion); - } - - /** - * Creates a new {@link CharTokenizer} instance - * - * @param matchVersion - * Lucene version to match See {@link above} - * @param factory - * the attribute factory to use for this {@link Tokenizer} - * @param input - * the input to split up into tokens - */ - public CharTokenizer(Version matchVersion, AttributeFactory factory, - Reader input) { - super(factory, input); - charUtils = CharacterUtils.getInstance(matchVersion); - } - - private int offset = 0, bufferIndex = 0, dataLen = 0, finalOffset = 0; - private static final int MAX_WORD_LEN = 255; - private static final int IO_BUFFER_SIZE = 4096; - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - - private final CharacterUtils charUtils; - private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE); - - /** - * Returns true iff a codepoint should be included in a token. This tokenizer - * generates as tokens adjacent sequences of codepoints which satisfy this - * predicate. Codepoints for which this is false are used to define token - * boundaries and are not included in tokens. - */ - protected abstract boolean isTokenChar(int c); - - /** - * Called on each token character to normalize it before it is added to the - * token. The default implementation does nothing. Subclasses may use this to, - * e.g., lowercase tokens. - */ - protected int normalize(int c) { - return c; - } - - @Override - public final boolean incrementToken() throws IOException { - clearAttributes(); - int length = 0; - int start = -1; // this variable is always initialized - char[] buffer = termAtt.buffer(); - while (true) { - if (bufferIndex >= dataLen) { - offset += dataLen; - if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils - dataLen = 0; // so next offset += dataLen won't decrement offset - if (length > 0) { - break; - } else { - finalOffset = correctOffset(offset); - return false; - } - } - dataLen = ioBuffer.getLength(); - bufferIndex = 0; - } - // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone - final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex); - bufferIndex += Character.charCount(c); - - if (isTokenChar(c)) { // if it's a token char - if (length == 0) { // start of token - assert start == -1; - start = offset + bufferIndex - 1; - } else if (length >= buffer.length-1) { // check if a supplementary could run out of bounds - buffer = termAtt.resizeBuffer(2+length); // make sure a supplementary fits in the buffer - } - length += Character.toChars(normalize(c), buffer, length); // buffer it, normalized - if (length >= MAX_WORD_LEN) // buffer overflow! make sure to check for >= surrogate pair could break == test - break; - } else if (length > 0) // at non-Letter w/ chars - break; // return 'em - } - - termAtt.setLength(length); - assert start != -1; - offsetAtt.setOffset(correctOffset(start), finalOffset = correctOffset(start+length)); - return true; - - } - - @Override - public final void end() { - // set final offset - offsetAtt.setOffset(finalOffset, finalOffset); - } - - @Override - public void reset(Reader input) throws IOException { - super.reset(input); - bufferIndex = 0; - offset = 0; - dataLen = 0; - finalOffset = 0; - ioBuffer.reset(); // make sure to reset the IO buffer!! - } -} \ No newline at end of file Index: lucene/src/java/org/apache/lucene/util/CharacterUtils.java =================================================================== --- lucene/src/java/org/apache/lucene/util/CharacterUtils.java (revision 1098593) +++ lucene/src/java/org/apache/lucene/util/CharacterUtils.java (working copy) @@ -1,286 +0,0 @@ -package org.apache.lucene.util; - -import java.io.IOException; -import java.io.Reader; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * {@link CharacterUtils} provides a unified interface to Character-related - * operations to implement backwards compatible character operations based on a - * {@link Version} instance. - * - * @lucene.internal - */ -public abstract class CharacterUtils { - private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils(); - private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils(); - - /** - * Returns a {@link CharacterUtils} implementation according to the given - * {@link Version} instance. - * - * @param matchVersion - * a version instance - * @return a {@link CharacterUtils} implementation according to the given - * {@link Version} instance. - */ - public static CharacterUtils getInstance(final Version matchVersion) { - return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4; - } - - /** - * Returns the code point at the given index of the char array. - * Depending on the {@link Version} passed to - * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior - * of {@link Character#codePointAt(char[], int)} as it would have been - * available on a Java 1.4 JVM or on a later virtual machine version. - * - * @param chars - * a character array - * @param offset - * the offset to the char values in the chars array to be converted - * - * @return the Unicode code point at the given index - * @throws NullPointerException - * - if the array is null. - * @throws IndexOutOfBoundsException - * - if the value offset is negative or not less than the length of - * the char array. - */ - public abstract int codePointAt(final char[] chars, final int offset); - - /** - * Returns the code point at the given index of the {@link CharSequence}. - * Depending on the {@link Version} passed to - * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior - * of {@link Character#codePointAt(char[], int)} as it would have been - * available on a Java 1.4 JVM or on a later virtual machine version. - * - * @param seq - * a character sequence - * @param offset - * the offset to the char values in the chars array to be converted - * - * @return the Unicode code point at the given index - * @throws NullPointerException - * - if the sequence is null. - * @throws IndexOutOfBoundsException - * - if the value offset is negative or not less than the length of - * the character sequence. - */ - public abstract int codePointAt(final CharSequence seq, final int offset); - - /** - * Returns the code point at the given index of the char array where only elements - * with index less than the limit are used. - * Depending on the {@link Version} passed to - * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior - * of {@link Character#codePointAt(char[], int)} as it would have been - * available on a Java 1.4 JVM or on a later virtual machine version. - * - * @param chars - * a character array - * @param offset - * the offset to the char values in the chars array to be converted - * @param limit the index afer the last element that should be used to calculate - * codepoint. - * - * @return the Unicode code point at the given index - * @throws NullPointerException - * - if the array is null. - * @throws IndexOutOfBoundsException - * - if the value offset is negative or not less than the length of - * the char array. - */ - public abstract int codePointAt(final char[] chars, final int offset, final int limit); - - /** - * Creates a new {@link CharacterBuffer} and allocates a char[] - * of the given bufferSize. - * - * @param bufferSize - * the internal char buffer size, must be >= 2 - * @return a new {@link CharacterBuffer} instance. - */ - public static CharacterBuffer newCharacterBuffer(final int bufferSize) { - if(bufferSize < 2) - throw new IllegalArgumentException("buffersize must be >= 2"); - return new CharacterBuffer(new char[bufferSize], 0, 0); - } - - /** - * Fills the {@link CharacterBuffer} with characters read from the given - * reader {@link Reader}. This method tries to read as many characters into - * the {@link CharacterBuffer} as possible, each call to fill will start - * filling the buffer from offset 0 up to the length of the size - * of the internal character array. - *

- * Depending on the {@link Version} passed to - * {@link CharacterUtils#getInstance(Version)} this method implements - * supplementary character awareness when filling the given buffer. For all - * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees - * that the given {@link CharacterBuffer} will never contain a high surrogate - * character as the last element in the buffer unless it is the last available - * character in the reader. In other words, high and low surrogate pairs will - * always be preserved across buffer boarders. - *

- * - * @param buffer - * the buffer to fill. - * @param reader - * the reader to read characters from. - * @return true if and only if no more characters are available - * in the reader, otherwise false. - * @throws IOException - * if the reader throws an {@link IOException}. - */ - public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException; - - private static final class Java5CharacterUtils extends CharacterUtils { - Java5CharacterUtils() { - } - - @Override - public final int codePointAt(final char[] chars, final int offset) { - return Character.codePointAt(chars, offset); - } - - @Override - public int codePointAt(final CharSequence seq, final int offset) { - return Character.codePointAt(seq, offset); - } - - @Override - public int codePointAt(final char[] chars, final int offset, final int limit) { - return Character.codePointAt(chars, offset, limit); - } - - @Override - public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException { - final char[] charBuffer = buffer.buffer; - buffer.offset = 0; - charBuffer[0] = buffer.lastTrailingHighSurrogate; - final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1; - buffer.lastTrailingHighSurrogate = 0; - final int read = reader.read(charBuffer, offset, charBuffer.length - - offset); - if (read == -1) { - buffer.length = offset; - return offset != 0; - } - buffer.length = read + offset; - // special case if the read returns 0 and the lastTrailingHighSurrogate was set - if (buffer.length > 1 - && Character.isHighSurrogate(charBuffer[buffer.length - 1])) { - buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length]; - } - return true; - } - } - - private static final class Java4CharacterUtils extends CharacterUtils { - Java4CharacterUtils() { - } - - @Override - public final int codePointAt(final char[] chars, final int offset) { - return chars[offset]; - } - - @Override - public int codePointAt(final CharSequence seq, final int offset) { - return seq.charAt(offset); - } - - @Override - public int codePointAt(final char[] chars, final int offset, final int limit) { - if(offset >= limit) - throw new IndexOutOfBoundsException("offset must be less than limit"); - return chars[offset]; - } - - @Override - public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException { - buffer.offset = 0; - final int read = reader.read(buffer.buffer); - if(read == -1) - return false; - buffer.length = read; - return true; - } - - } - - /** - * A simple IO buffer to use with - * {@link CharacterUtils#fill(CharacterBuffer, Reader)}. - */ - public static final class CharacterBuffer { - - private final char[] buffer; - private int offset; - private int length; - private char lastTrailingHighSurrogate = 0; - - CharacterBuffer(char[] buffer, int offset, int length) { - this.buffer = buffer; - this.offset = offset; - this.length = length; - } - - /** - * Returns the internal buffer - * - * @return the buffer - */ - public char[] getBuffer() { - return buffer; - } - - /** - * Returns the data offset in the internal buffer. - * - * @return the offset - */ - public int getOffset() { - return offset; - } - - /** - * Return the length of the data in the internal buffer starting at - * {@link #getOffset()} - * - * @return the length - */ - public int getLength() { - return length; - } - - /** - * Resets the CharacterBuffer. All internals are reset to its default - * values. - */ - public void reset() { - offset = 0; - length = 0; - lastTrailingHighSurrogate = 0; - } - } - -} Index: lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java =================================================================== --- lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java (revision 1098593) +++ lucene/src/test-framework/org/apache/lucene/analysis/MockTokenizer.java (working copy) @@ -20,14 +20,15 @@ import java.io.IOException; import java.io.Reader; -import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.RegExp; /** * Automaton-based tokenizer for testing. Optionally lowercases. */ -public class MockTokenizer extends CharTokenizer { +public class MockTokenizer extends Tokenizer { /** Acts Similar to WhitespaceTokenizer */ public static final CharacterRunAutomaton WHITESPACE = new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton()); @@ -45,21 +46,67 @@ private final boolean lowerCase; private int state; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + int off = 0; + public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) { - super(LuceneTestCase.TEST_VERSION_CURRENT, factory, input); + super(factory, input); this.runAutomaton = runAutomaton; this.lowerCase = lowerCase; this.state = runAutomaton.getInitialState(); } public MockTokenizer(Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase) { - super(LuceneTestCase.TEST_VERSION_CURRENT, input); + super(input); this.runAutomaton = runAutomaton; this.lowerCase = lowerCase; this.state = runAutomaton.getInitialState(); } @Override + public final boolean incrementToken() throws IOException { + clearAttributes(); + for (;;) { + int startOffset = off; + int cp = readCodePoint(); + if (cp < 0) { + break; + } else if (isTokenChar(cp)) { + int endOffset; + do { + char chars[] = Character.toChars(normalize(cp)); + for (int i = 0; i < chars.length; i++) + termAtt.append(chars[i]); + endOffset = off; + cp = readCodePoint(); + } while (cp >= 0 && isTokenChar(cp)); + offsetAtt.setOffset(startOffset, endOffset); + return true; + } + } + return false; + } + + protected int readCodePoint() throws IOException { + int ch = input.read(); + if (ch < 0) { + return ch; + } else { + assert !Character.isLowSurrogate((char) ch); + off++; + if (Character.isHighSurrogate((char) ch)) { + int ch2 = input.read(); + if (ch2 >= 0) { + off++; + assert Character.isLowSurrogate((char) ch2); + return Character.toCodePoint((char) ch, (char) ch2); + } + } + return ch; + } + } + protected boolean isTokenChar(int c) { state = runAutomaton.step(state, c); if (state < 0) { @@ -70,7 +117,6 @@ } } - @Override protected int normalize(int c) { return lowerCase ? Character.toLowerCase(c) : c; } @@ -79,5 +125,12 @@ public void reset() throws IOException { super.reset(); state = runAutomaton.getInitialState(); + off = 0; } + + @Override + public void end() throws IOException { + int finalOffset = correctOffset(off); + offsetAtt.setOffset(finalOffset, finalOffset); + } } Index: lucene/MIGRATE.txt =================================================================== --- lucene/MIGRATE.txt (revision 1098593) +++ lucene/MIGRATE.txt (working copy) @@ -312,6 +312,8 @@ - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader + - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer + - o.a.l.util.CharacterUtils -> o.a.l.analysis.util.CharacterUtils * LUCENE-2514: The option to use a Collator's order (instead of binary order) for sorting and range queries has been moved to contrib/queries.