Index: lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java
===================================================================
--- lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 965887)
+++ lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (working copy)
@@ -77,7 +77,7 @@
   }
 
   // *** TermToBytesRefAttribute interface ***
-  public final int toBytesRef(BytesRef target) {
+  public int toBytesRef(BytesRef target) {
     return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target);
   }
 
Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java
===================================================================
--- modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (revision 965887)
+++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (working copy)
@@ -30,7 +30,7 @@
  */
 public final class KeywordTokenizer extends Tokenizer {
 
-  private static final int DEFAULT_BUFFER_SIZE = 256;
+  public static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done = false;
   private int finalOffset;
Index: modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/BOCUAttributeFactory.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/BOCUAttributeFactory.java (revision 0)
+++ modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/BOCUAttributeFactory.java (revision 0)
@@ -0,0 +1,62 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.icu.tokenattributes.BOCUTermAttributeImpl;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Attribute factory for BOCU-1 compressed terms.
+ * <p>
+ * Attribute interfaces implemented by {@link BOCUTermAttributeImpl} are served
+ * by a new {@link BOCUTermAttributeImpl}; every other attribute is created by
+ * the delegate factory.
+ */
+public class BOCUAttributeFactory extends AttributeSource.AttributeFactory {
+  /** Factory consulted for attributes not implemented by BOCUTermAttributeImpl. */
+  private final AttributeSource.AttributeFactory delegate;
+
+  /** Creates a factory that falls back to the default attribute factory. */
+  public BOCUAttributeFactory() {
+    this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
+  }
+
+  /**
+   * Creates a factory that falls back to <code>delegate</code>.
+   *
+   * @param delegate factory used for every attribute that is not backed by
+   *        {@link BOCUTermAttributeImpl}
+   */
+  public BOCUAttributeFactory(AttributeSource.AttributeFactory delegate) {
+    this.delegate = delegate;
+  }
+
+  /**
+   * Returns a new {@link BOCUTermAttributeImpl} when that class implements
+   * <code>attClass</code>; otherwise asks the delegate factory.
+   */
+  @Override
+  public AttributeImpl createAttributeInstance(
+      Class<? extends Attribute> attClass) {
+    return attClass.isAssignableFrom(BOCUTermAttributeImpl.class)
+      ? new BOCUTermAttributeImpl()
+      : delegate.createAttributeInstance(attClass);
+  }
+}

Property changes on: modules\analysis\icu\src\java\org\apache\lucene\analysis\icu\BOCUAttributeFactory.java
___________________________________________________________________
Added: svn:keywords
   + Date Author Id Revision HeadURL
Added: svn:eol-style
   + native

Index: modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/BOCUTermAttributeImpl.java
===================================================================
--- modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/BOCUTermAttributeImpl.java (revision 0)
+++ modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/tokenattributes/BOCUTermAttributeImpl.java (revision 0)
@@ -0,0 +1,88 @@
+package org.apache.lucene.analysis.icu.tokenattributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
+import org.apache.lucene.util.BytesRef;
+
+import com.ibm.icu.charset.CharsetICU;
+
+/**
+ * Impl of BOCU-1 compressed terms: {@link #toBytesRef(BytesRef)} encodes the
+ * term with ICU's "BOCU-1" charset instead of the UTF-8 conversion used by
+ * {@link CharTermAttributeImpl}.
+ */
+public class BOCUTermAttributeImpl extends CharTermAttributeImpl {
+
+  private final CharsetEncoder encoder = CharsetICU.forNameICU("BOCU-1").newEncoder();
+
+  // Buffer views are cached together with the arrays they wrap, so they are
+  // only re-created when the term buffer or the target byte array is replaced.
+  private char[] lastCharArray = null;
+  private CharBuffer cbuf = null;
+  private byte[] lastByteArray = null;
+  private ByteBuffer bbuf = null;
+
+  /**
+   * Encodes the current term as BOCU-1 into <code>target</code>.
+   * <p>
+   * Output starts at offset 0 of <code>target.bytes</code> (grown first if
+   * needed) and <code>target.length</code> is set to the number of bytes written.
+   *
+   * @param target receives the encoded term
+   * @return <code>target.hashCode()</code>, computed after encoding
+   * @throws RuntimeException if the output buffer is too small or the encoder
+   *         reports an error (malformed or unmappable input)
+   */
+  @Override
+  public int toBytesRef(final BytesRef target) {
+    target.offset = 0;
+    target.grow((int) Math.ceil(length() * encoder.maxBytesPerChar()));
+    if (cbuf == null || lastCharArray != buffer()) {
+      lastCharArray = buffer();
+      cbuf = CharBuffer.wrap(lastCharArray);
+    }
+    if (bbuf == null || lastByteArray != target.bytes) {
+      lastByteArray = target.bytes;
+      bbuf = ByteBuffer.wrap(lastByteArray);
+    }
+    cbuf.rewind().limit(length());
+    bbuf.rewind().limit(target.bytes.length);
+    encoder.reset();
+    // encode() reports overflow and coding errors through its return value;
+    // ignoring it would silently produce a truncated term.
+    CoderResult result = encoder.encode(cbuf, bbuf, true);
+    if (result.isUnderflow()) {
+      result = encoder.flush(bbuf);
+    }
+    if (result.isOverflow()) {
+      throw new RuntimeException("Buffer too small");
+    }
+    if (result.isError()) {
+      throw new RuntimeException("Cannot encode term as BOCU-1: " + result);
+    }
+    target.length = bbuf.position();
+    return target.hashCode();
+  }
+
+}

Property changes on: modules\analysis\icu\src\java\org\apache\lucene\analysis\icu\tokenattributes\BOCUTermAttributeImpl.java
___________________________________________________________________
Added: svn:keywords
   + Date Author Id Revision HeadURL
Added: svn:eol-style
   + native

Index: modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestBOCU.java
===================================================================
--- modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestBOCU.java (revision 0)
+++ modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestBOCU.java (revision 0)
@@ -0,0 +1,57 @@
+package org.apache.lucene.analysis.icu;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.StringReader;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.util.Random;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util._TestUtil;
+
+import com.ibm.icu.charset.CharsetICU;
+
+/** Tests the term bytes produced by a tokenizer built with {@link BOCUAttributeFactory}. */
+public class TestBOCU extends BaseTokenStreamTestCase {
+  /**
+   * Simple roundtrip decode of the consumed terms: each random string is
+   * tokenized as a single keyword term, converted with toBytesRef, decoded
+   * with ICU's BOCU-1 decoder and compared with the original string.
+   */
+  public void testRandomTerms() throws Exception {
+    Random random = newRandom();
+    CharsetDecoder decoder = CharsetICU.forNameICU("BOCU-1").newDecoder();
+    KeywordTokenizer tokenizer = new KeywordTokenizer(new BOCUAttributeFactory(),
+        new StringReader(""), KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+    TermToBytesRefAttribute termAtt = tokenizer.getAttribute(TermToBytesRefAttribute.class);
+    BytesRef encoded = new BytesRef();
+
+    for (int i = 0; i < 10000 * _TestUtil.getRandomMultiplier(); i++) {
+      String randomString = _TestUtil.randomUnicodeString(random);
+      tokenizer.reset(new StringReader(randomString));
+      assertTrue(tokenizer.incrementToken());
+      termAtt.toBytesRef(encoded);
+      ByteBuffer buffer = ByteBuffer.wrap(encoded.bytes, encoded.offset, encoded.length);
+      assertEquals(randomString, decoder.decode(buffer).toString());
+    }
+  }
+}

Property changes on: modules\analysis\icu\src\test\org\apache\lucene\analysis\icu\TestBOCU.java
___________________________________________________________________
Added: svn:keywords
   + Date Author Id Revision HeadURL
Added: svn:eol-style
   + native