Index: lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java =================================================================== --- lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (revision 965887) +++ lucene/src/java/org/apache/lucene/analysis/tokenattributes/CharTermAttributeImpl.java (working copy) @@ -77,7 +77,7 @@ } // *** TermToBytesRefAttribute interface *** - public final int toBytesRef(BytesRef target) { + public int toBytesRef(BytesRef target) { return UnicodeUtil.UTF16toUTF8WithHash(termBuffer, 0, termLength, target); } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (revision 965887) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/core/KeywordTokenizer.java (working copy) @@ -30,7 +30,7 @@ */ public final class KeywordTokenizer extends Tokenizer { - private static final int DEFAULT_BUFFER_SIZE = 256; + public static final int DEFAULT_BUFFER_SIZE = 256; private boolean done = false; private int finalOffset; Index: modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CustomCharsetTermAttributeFactory.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CustomCharsetTermAttributeFactory.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CustomCharsetTermAttributeFactory.java (revision 0) @@ -0,0 +1,57 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.nio.charset.Charset; + +import org.apache.lucene.analysis.miscellaneous.tokenattributes.CustomCharsetTermAttributeImpl; +import org.apache.lucene.util.Attribute; +import org.apache.lucene.util.AttributeImpl; +import org.apache.lucene.util.AttributeSource; + +/** + * Attribute factory that creates a {@link CustomCharsetTermAttributeImpl} (encoding terms with the configured {@link Charset}) for every attribute interface that class implements, and hands all other attribute requests to a delegate factory. + */ +public class CustomCharsetTermAttributeFactory extends AttributeSource.AttributeFactory { + private final AttributeSource.AttributeFactory delegate; + private final Charset charset; + /** Creates a factory for the given charset that uses the default attribute factory as delegate. */ + public CustomCharsetTermAttributeFactory(Charset charset) { + this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, charset); + } + /** Creates a factory for the charset with the given name that uses the default attribute factory as delegate. */ + public CustomCharsetTermAttributeFactory(String charset) { + this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, charset); + } + /** Creates a factory that builds term attributes encoding with {@code charset} and asks {@code delegate} for every other attribute. */ + public CustomCharsetTermAttributeFactory(AttributeSource.AttributeFactory delegate, Charset charset) { + this.delegate = delegate; + this.charset = charset; + } + /** Same as the (delegate, Charset) constructor, resolving the charset by name via {@link Charset#forName(String)}; the caller's delegate is passed through rather than being replaced by the default factory. */ + public CustomCharsetTermAttributeFactory(AttributeSource.AttributeFactory delegate, String charset) { + this(delegate, Charset.forName(charset)); + } + /** Returns a new custom-charset term attribute (with a fresh encoder) when {@code attClass} is a supertype of {@link CustomCharsetTermAttributeImpl}; otherwise returns whatever the delegate creates. */ + @Override + public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) { + return attClass.isAssignableFrom(CustomCharsetTermAttributeImpl.class) + ?
new CustomCharsetTermAttributeImpl(charset.newEncoder()) + : delegate.createAttributeInstance(attClass); + } +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\miscellaneous\CustomCharsetTermAttributeFactory.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/tokenattributes/CustomCharsetTermAttributeImpl.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/tokenattributes/CustomCharsetTermAttributeImpl.java (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/tokenattributes/CustomCharsetTermAttributeImpl.java (revision 0) @@ -0,0 +1,73 @@ +package org.apache.lucene.analysis.miscellaneous.tokenattributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.CoderResult; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; +import org.apache.lucene.util.BytesRef; + +/** Term attribute whose {@code toBytesRef} encodes the term chars with a caller-supplied {@link CharsetEncoder}; an encoding failure (e.g. malformed input or an unmappable character) is reported as a {@link RuntimeException}. */ +public class CustomCharsetTermAttributeImpl extends CharTermAttributeImpl { + // encoder used by every toBytesRef call; it is reset before each use + private final CharsetEncoder encoder; + private final int bytesPerChar; // ceil(encoder.maxBytesPerChar()); used to size the output array + // cached wrappers around the term buffer and the target byte array; each is re-created when its backing array changes + private CharBuffer cbuf = null; + // TODO: maybe use a WeakReference for bbuf so that we do not hold on to an unused array indefinitely? + private ByteBuffer bbuf = null; + /** Keeps the given encoder after setting it to REPORT malformed input and unmappable characters, and derives the per-char output size from its maxBytesPerChar(). */ + public CustomCharsetTermAttributeImpl(CharsetEncoder encoder) { + this.encoder = encoder.onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + this.bytesPerChar = (int) Math.ceil(encoder.maxBytesPerChar()); + } + /** Encodes the current term into {@code target} starting at offset 0, sets {@code target.length} to the number of bytes written and returns {@code target.hashCode()}; throws a RuntimeException when encode or flush returns anything other than UNDERFLOW. */ + @Override + public int toBytesRef(final BytesRef target) { + // reuse the previous input CharBuffer if possible + // (i.e. when its backing array is still the term buffer); limit it to the term length + if (cbuf == null || cbuf.array() != buffer()) { + cbuf = CharBuffer.wrap(buffer()); + } + cbuf.rewind().limit(length()); + + // reuse the previous output ByteBuffer if possible + // (i.e. when its backing array is still the BytesRef's array after grow()) + target.offset = 0; + target.grow(length() * bytesPerChar); + if (bbuf == null || bbuf.array() != target.bytes) { + bbuf = ByteBuffer.wrap(target.bytes); + } + bbuf.rewind(); + + // encode the chars; any result other than UNDERFLOW (from encode or from flush) is treated as an error + encoder.reset(); + CoderResult res; + if ((res = encoder.encode(cbuf, bbuf, true)) != CoderResult.UNDERFLOW || (res = encoder.flush(bbuf)) != CoderResult.UNDERFLOW) + throw new RuntimeException("Charset encoding error occurred: " + res); + target.length = bbuf.position(); + + return target.hashCode(); + } + +} Property changes on: modules\analysis\common\src\java\org\apache\lucene\analysis\miscellaneous\tokenattributes\CustomCharsetTermAttributeImpl.java
___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCustomCharsetTermAttribute.java =================================================================== --- modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCustomCharsetTermAttribute.java (revision 0) +++ modules/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCustomCharsetTermAttribute.java (revision 0) @@ -0,0 +1,70 @@ +package org.apache.lucene.analysis.miscellaneous; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.StringReader; +import java.util.Random; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.analysis.miscellaneous.tokenattributes.CustomCharsetTermAttributeImpl; +/** Tests that {@link CustomCharsetTermAttributeFactory} installs {@link CustomCharsetTermAttributeImpl}, that UTF-8 encoded terms round-trip, and that an unmappable character is rejected. */ +public class TestCustomCharsetTermAttribute extends BaseTokenStreamTestCase { + + /** Encodes random unicode strings through a UTF-8 factory and checks that decoding the bytes yields the original string. */ + public void testUTF8RoundTrip() throws Exception { + Random random = newRandom(); + final KeywordTokenizer tokenizer = new KeywordTokenizer(new CustomCharsetTermAttributeFactory("UTF-8"), + new StringReader(""), KeywordTokenizer.DEFAULT_BUFFER_SIZE); + assertTrue("CharTermAttribute has wrong class", + tokenizer.getAttribute(CharTermAttribute.class) instanceof CustomCharsetTermAttributeImpl); + final TermToBytesRefAttribute termAtt = tokenizer.getAttribute(TermToBytesRefAttribute.class); + final BytesRef encoded = new BytesRef(); + + for (int i = 0; i < 10000 * _TestUtil.getRandomMultiplier(); i++) { + String randomString = _TestUtil.randomUnicodeString(random); + tokenizer.reset(new StringReader(randomString)); + assertTrue(tokenizer.incrementToken()); + termAtt.toBytesRef(encoded); + assertEquals(randomString, encoded.utf8ToString()); + assertFalse(tokenizer.incrementToken()); + } + } + + /** Checks that error handling works correctly: encoding a char that ISO-8859-1 cannot represent must throw. */ + public void testInvalidChars() throws Exception { + final AttributeSource atts = new AttributeSource(new CustomCharsetTermAttributeFactory("ISO-8859-1")); + final CharTermAttribute termAtt = atts.addAttribute(CharTermAttribute.class); + assertTrue("CharTermAttribute has wrong class", termAtt instanceof CustomCharsetTermAttributeImpl); + final
TermToBytesRefAttribute bytesAtt = atts.getAttribute(TermToBytesRefAttribute.class); + termAtt.setEmpty().append("\u1234"); + try { + final BytesRef encoded = new BytesRef(); + bytesAtt.toBytesRef(encoded); + fail("It should not be possible to encode \\u1234 using ISO-8859-1"); + } catch (RuntimeException e) { + // pass: the encoding failure for U+1234 is the expected outcome + } + } + +} Property changes on: modules\analysis\common\src\test\org\apache\lucene\analysis\miscellaneous\TestCustomCharsetTermAttribute.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/BOCUAttributeFactory.java =================================================================== --- modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/BOCUAttributeFactory.java (revision 0) +++ modules/analysis/icu/src/java/org/apache/lucene/analysis/icu/BOCUAttributeFactory.java (revision 0) @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +import java.nio.charset.Charset; + +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.analysis.miscellaneous.CustomCharsetTermAttributeFactory; + +/** + * Attribute factory for BOCU-1 compressed terms: a {@link CustomCharsetTermAttributeFactory} preconfigured with {@link #BOCU_CHARSET}. + */ +public class BOCUAttributeFactory extends CustomCharsetTermAttributeFactory { + + // bind dynamically (via reflection) to the ICU charsets API, so we don't need to ship the (large) JAR file + public static final Charset BOCU_CHARSET; + static { + try { + BOCU_CHARSET = (Charset) Class.forName("com.ibm.icu.charset.CharsetICU") + .getMethod("forNameICU", String.class).invoke(null,"BOCU-1"); + } catch (Exception e) { + throw new RuntimeException("To use " + BOCUAttributeFactory.class.getName() + + " you need to add icu4j-charsets.jar to your classpath.", e); + } + } + /** Creates a BOCU-1 factory via the superclass constructor that takes only a charset. */ + public BOCUAttributeFactory() { + super(BOCU_CHARSET); + } + /** Creates a BOCU-1 factory, passing {@code delegate} on to the superclass constructor. */ + public BOCUAttributeFactory(AttributeSource.AttributeFactory delegate) { + super(delegate, BOCU_CHARSET); + } + +} Property changes on: modules\analysis\icu\src\java\org\apache\lucene\analysis\icu\BOCUAttributeFactory.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native Index: modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestBOCU.java =================================================================== --- modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestBOCU.java (revision 0) +++ modules/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestBOCU.java (revision 0) @@ -0,0 +1,59 @@ +package org.apache.lucene.analysis.icu; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.StringReader; +import java.nio.ByteBuffer; +import java.nio.charset.CharsetDecoder; +import java.util.Random; + +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; +/** Round-trip test for terms encoded through {@link BOCUAttributeFactory}; it prints a warning and returns early when the BOCU charset cannot be initialized. */ +public class TestBOCU extends BaseTokenStreamTestCase { + /** Encodes random unicode strings via the BOCU factory and checks that decoding the bytes with the charset's decoder yields the original string. */ + public void testRandomTerms() throws Exception { + final CharsetDecoder decoder; + try { + decoder = BOCUAttributeFactory.BOCU_CHARSET.newDecoder(); + } catch (java.lang.ExceptionInInitializerError e) { + System.err.println("WARNING: " + e.getCause().getMessage()); + System.err.println("Tests were not executed!"); + return; + } + // reaching this point means BOCU_CHARSET was initialized, so the factory can be used + Random random = newRandom(); + final KeywordTokenizer tokenizer = new KeywordTokenizer(new BOCUAttributeFactory(), + new StringReader(""), KeywordTokenizer.DEFAULT_BUFFER_SIZE); + final TermToBytesRefAttribute termAtt = tokenizer.getAttribute(TermToBytesRefAttribute.class); + final BytesRef encoded = new BytesRef(); + + for (int i = 0; i < 10000 * _TestUtil.getRandomMultiplier(); i++) { + String randomString = _TestUtil.randomUnicodeString(random); + tokenizer.reset(new StringReader(randomString)); + assertTrue(tokenizer.incrementToken()); +
termAtt.toBytesRef(encoded); + ByteBuffer buffer = ByteBuffer.wrap(encoded.bytes, encoded.offset, encoded.length); + assertEquals(randomString, decoder.decode(buffer).toString()); + assertFalse(tokenizer.incrementToken()); + } + } +} Property changes on: modules\analysis\icu\src\test\org\apache\lucene\analysis\icu\TestBOCU.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native