diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java index d154200..9957ec1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java @@ -23,6 +23,7 @@ import java.util.Comparator; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; /** Iterator to seek ({@link #seekCeil(BytesRef)}, {@link * #seekExact(BytesRef,boolean)}) or step through ({@link @@ -40,7 +41,7 @@ import org.apache.lucene.util.BytesRef; * of the seek methods. * * @lucene.experimental */ -public abstract class TermsEnum { +public abstract class TermsEnum implements BytesRefIterator { private AttributeSource atts = null; @@ -114,14 +115,6 @@ public abstract class TermsEnum { } } - /** Increments the enumeration to the next term. - * Returns the resulting term, or null if the end was - * hit (which means the enum is unpositioned). The - * returned BytesRef may be re-used across calls to next. - * After this method returns null, do not call it again: - * the results are undefined. */ - public abstract BytesRef next() throws IOException; - /** Returns current term. Do not call this when the enum * is unpositioned. 
*/ public abstract BytesRef term() throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java index 1d8ebd2..cb56a51 100644 --- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java +++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java @@ -281,6 +281,37 @@ public final class ByteBlockPool { } /** + * Copies the bytes.length bytes found at pool position bytes.offset into the given ref's own array, resets the ref's offset to 0 and returns the same ref; a value that crosses a block boundary is read from consecutive buffers. + */ + public final BytesRef copyFrom(final BytesRef bytes) { + final int length = bytes.length; + final int offset = bytes.offset; + bytes.offset = 0; + bytes.grow(length); + int bufferIndex = offset >> BYTE_BLOCK_SHIFT; + byte[] buffer = buffers[bufferIndex]; + int pos = offset & BYTE_BLOCK_MASK; + int overflow = (pos + length) - BYTE_BLOCK_SIZE; + do { + if (overflow <= 0) { + System.arraycopy(buffer, pos, bytes.bytes, bytes.offset, bytes.length); + bytes.length = length; + bytes.offset = 0; + break; + } else { + final int bytesToCopy = length - overflow; + System.arraycopy(buffer, pos, bytes.bytes, bytes.offset, bytesToCopy); + pos = 0; + bytes.length -= bytesToCopy; + bytes.offset += bytesToCopy; + buffer = buffers[++bufferIndex]; + overflow = overflow - BYTE_BLOCK_SIZE; + } + } while (true); + return bytes; + } + + /** * Writes the pools content to the given {@link DataOutput} */ public final void writePool(final DataOutput out) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java new file mode 100644 index 0000000..5809bb5 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java @@ -0,0 +1,52 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +/** + * A simple iterator interface for {@link BytesRef} iteration + * + */ +public interface BytesRefIterator { + + public static final BytesRefIterator EMPTY_ITERATOR = new EmptyBytesRefIterator(); + + /** + * Increments the iteration to the next {@link BytesRef} in the iterator. + * Returns the resulting {@link BytesRef} or null if the end of + * the iterator is reached. The returned BytesRef may be re-used across calls + * to next. After this method returns null, do not call it again: the results + * are undefined. + * + * @return the next {@link BytesRef} in the iterator or null if + * the end of the iterator is reached. + * @throws IOException + */ + public BytesRef next() throws IOException; + + public final static class EmptyBytesRefIterator implements BytesRefIterator { + + @Override + public BytesRef next() throws IOException { + return null; + } + + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefList.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefList.java new file mode 100644 index 0000000..2630a7c --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefList.java @@ -0,0 +1,119 @@ +package org.apache.lucene.util; + +import java.io.IOException; +import java.util.Comparator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +public class BytesRefList { + + private final ByteBlockPool pool; + private int[] offsets = new int[1]; + private int currentElement = 0; + private int currentOffset = 0; + + public BytesRefList() { + this(new ByteBlockPool(new ByteBlockPool.DirectAllocator())); + } + + public BytesRefList(ByteBlockPool pool) { + this.pool = pool; + pool.nextBuffer(); + } + + public int append(BytesRef bytes) { + if (currentElement >= offsets.length) { + offsets = ArrayUtil.grow(offsets, offsets.length + 1); + } + pool.copy(bytes); + offsets[currentElement++] = currentOffset; + currentOffset += bytes.length; + return currentElement; + } + + public int size() { + return currentElement; + } + + public BytesRef get(BytesRef bytes, int pos) { + if (currentElement > pos) { + bytes.offset = offsets[pos]; + bytes.length = pos == currentElement - 1 ? 
currentOffset - bytes.offset + : offsets[pos + 1] - bytes.offset; + pool.copyFrom(bytes); + return bytes; + } + throw new IndexOutOfBoundsException("index " + pos + + " must be less than the size: " + currentElement); + + } + + public BytesRefIterator iterator() { + final int numElements = currentElement; + + return new BytesRefIterator() { + private final BytesRef spare = new BytesRef(); + private int pos = 0; + + @Override + public BytesRef next() throws IOException { + if (pos < numElements) { + get(spare, pos++); + return spare; + } + return null; + } + }; + } + + public int[] sort(final Comparator comp) { + final int[] orderdEntries = new int[size()]; + for (int i = 0; i < orderdEntries.length; i++) { + orderdEntries[i] = i; + } + new SorterTemplate() { + @Override + protected void swap(int i, int j) { + final int o = orderdEntries[i]; + orderdEntries[i] = orderdEntries[j]; + orderdEntries[j] = o; + } + + @Override + protected int compare(int i, int j) { + final int ord1 = orderdEntries[i], ord2 = orderdEntries[j]; + return comp.compare(get(scratch1, ord1), get(scratch2, ord2)); + } + + @Override + protected void setPivot(int i) { + final int ord = orderdEntries[i]; + get(pivot, ord); + } + + @Override + protected int comparePivot(int j) { + final int ord = orderdEntries[j]; + return comp.compare(pivot, get(scratch2, ord)); + } + + private final BytesRef pivot = new BytesRef(), + scratch1 = new BytesRef(), scratch2 = new BytesRef(); + }.quickSort(0, size() - 1); + return orderdEntries; + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefList.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefList.java new file mode 100644 index 0000000..1cc737e --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefList.java @@ -0,0 +1,81 @@ +package org.apache.lucene.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with this + * work for additional information regarding copyright ownership. The ASF + * licenses this file to You under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class TestBytesRefList extends LuceneTestCase { + + public void testAppend() throws IOException { + BytesRefList list = new BytesRefList(); + List stringList = new ArrayList(); + int entries = atLeast(500); + BytesRef spare = new BytesRef(); + for (int i = 0; i < entries; i++) { + String randomRealisticUnicodeString = _TestUtil + .randomRealisticUnicodeString(random); + spare.copyChars(randomRealisticUnicodeString); + list.append(spare); + stringList.add(randomRealisticUnicodeString); + } + for (int i = 0; i < entries; i++) { + assertNotNull(list.get(spare, i)); + assertEquals("entry " + i + " doesn't match", stringList.get(i), + spare.utf8ToString()); + } + + // check random + for (int i = 0; i < entries; i++) { + int e = random.nextInt(entries); + assertNotNull(list.get(spare, e)); + assertEquals("entry " + i + " doesn't match", stringList.get(e), + spare.utf8ToString()); + } + for (int i = 0; i < 2; i++) { + + BytesRefIterator iterator = list.iterator(); + for (String string : stringList) { + assertEquals(string, iterator.next().utf8ToString()); + } + } + } + + public void testSort() { + BytesRefList list = new BytesRefList(); + List stringList = new 
ArrayList(); + int entries = atLeast(500); + BytesRef spare = new BytesRef(); + for (int i = 0; i < entries; i++) { + String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random); + spare.copyChars(randomRealisticUnicodeString); + list.append(spare); + stringList.add(randomRealisticUnicodeString); + } + Collections.sort(stringList); + int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator()); + for (int i = 0; i < entries; i++) { + assertNotNull(list.get(spare, sortedOrds[i])); + assertEquals("entry " + i + " doesn't match", stringList.get(i), + spare.utf8ToString()); + } + + } +} diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java index 4dee714..073da44 100755 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java @@ -16,7 +16,7 @@ package org.apache.lucene.search.spell; * limitations under the License. */ -import java.util.Iterator; +import org.apache.lucene.util.BytesRefIterator; /** * A simple interface representing a Dictionary. 
A Dictionary @@ -30,5 +30,5 @@ public interface Dictionary { * Return all words present in the dictionary * @return Iterator */ - Iterator getWordsIterator(); + BytesRefIterator getWordsIterator(); } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java index c867253..bec1c31 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java @@ -18,12 +18,14 @@ package org.apache.lucene.search.spell; import java.io.IOException; +import java.util.Comparator; import java.util.Iterator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.Terms; import org.apache.lucene.index.MultiFields; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; @@ -50,14 +52,13 @@ public class HighFrequencyDictionary implements Dictionary { this.thresh = thresh; } - public final Iterator getWordsIterator() { + public final BytesRefIterator getWordsIterator() { return new HighFrequencyIterator(); } final class HighFrequencyIterator implements TermFreqIterator, SortedIterator { - private TermsEnum termsEnum; - private BytesRef actualTerm; - private boolean hasNextCalled; + private final BytesRef spare = new BytesRef(); + private final TermsEnum termsEnum; private int minNumDocs; HighFrequencyIterator() { @@ -65,6 +66,8 @@ public class HighFrequencyDictionary implements Dictionary { Terms terms = MultiFields.getTerms(reader, field); if (terms != null) { termsEnum = terms.iterator(null); + } else { + termsEnum = null; } minNumDocs = (int)(thresh * (float)reader.numDocs()); } catch (IOException e) { @@ -83,57 +86,27 @@ public class HighFrequencyDictionary implements 
Dictionary { throw new RuntimeException(ioe); } } - - public String next() { - if (!hasNextCalled && !hasNext()) { - return null; - } - hasNextCalled = false; - - if (actualTerm == null) { - return null; - } else { - UnicodeUtil.UTF8toUTF16(actualTerm, spare); - return spare.toString(); - } - } - - public boolean hasNext() { - if (hasNextCalled) { - return actualTerm != null; - } - hasNextCalled = true; - - if (termsEnum == null) { - return false; - } - while(true) { - try { - actualTerm = termsEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - // if there are no words return false - if (actualTerm == null) { - return false; - } - - // got a valid term, does it pass the threshold? - try { - if (isFrequent(termsEnum.docFreq())) { - return true; - } - } catch (IOException ioe) { - throw new RuntimeException(ioe); + @Override + public BytesRef next() throws IOException { + BytesRef next; // keep scanning: a term below the threshold is skipped, it must not end the iteration + while (termsEnum != null && (next = termsEnum.next()) != null) { + if (isFrequent(termsEnum.docFreq())) { + spare.copyBytes(next); + return spare; } } + return null; } - public void remove() { - throw new UnsupportedOperationException(); + @Override + public Comparator comparator() { + try { + return termsEnum == null ? null : termsEnum.getComparator(); // termsEnum is null when the field has no terms + } catch (IOException e) { + throw new RuntimeException(e); + } } } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java index 894dc0c..bd4afcc 100755 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java @@ -18,13 +18,7 @@ package org.apache.lucene.search.spell; */ import org.apache.lucene.index.IndexReader; - -import java.util.Iterator; - -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import 
org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.index.Terms; import org.apache.lucene.index.MultiFields; @@ -49,50 +43,18 @@ public class LuceneDictionary implements Dictionary { this.field = field; } - public final Iterator getWordsIterator() { - return new LuceneIterator(); - } - - - final class LuceneIterator implements Iterator { - private TermsEnum termsEnum; - private BytesRef pendingTerm; - private final CharsRef spare = new CharsRef(); - - LuceneIterator() { - try { - final Terms terms = MultiFields.getTerms(reader, field); - if (terms != null) { - termsEnum = terms.iterator(null); - pendingTerm = termsEnum.next(); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public String next() { - if (pendingTerm == null) { - return null; + public final BytesRefIterator getWordsIterator() { + + try { + final Terms terms = MultiFields.getTerms(reader, field); + if (terms != null) { + return terms.iterator(null); + } else { + return BytesRefIterator.EMPTY_ITERATOR; } - - UnicodeUtil.UTF8toUTF16(pendingTerm, spare); - - try { - pendingTerm = termsEnum.next(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - return spare.toString(); - } - - public boolean hasNext() { - return pendingTerm != null; - } - - public void remove() { - throw new UnsupportedOperationException(); + } catch (IOException e) { + throw new RuntimeException(e); } } + } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java index 2eaac46..39b1b0e 100755 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java @@ -21,6 +21,10 @@ package org.apache.lucene.search.spell; import java.util.Iterator; import java.io.*; +import org.apache.lucene.util.BytesRef; +import 
org.apache.lucene.util.BytesRefIterator; +import org.apache.lucene.util.IOUtils; + /** * Dictionary represented by a text file. @@ -33,8 +37,6 @@ import java.io.*; public class PlainTextDictionary implements Dictionary { private BufferedReader in; - private String line; - private boolean hasNextCalled; public PlainTextDictionary(File file) throws FileNotFoundException { in = new BufferedReader(new FileReader(file)); @@ -51,31 +53,37 @@ public class PlainTextDictionary implements Dictionary { in = new BufferedReader(reader); } - public Iterator getWordsIterator() { - return new fileIterator(); + public BytesRefIterator getWordsIterator() { + return new FileIterator(); } - final class fileIterator implements Iterator { - public String next() { - if (!hasNextCalled) { - hasNext(); + final class FileIterator implements BytesRefIterator { + private boolean done = false; + private final BytesRef spare = new BytesRef(); + @Override + public BytesRef next() throws IOException { + if (done) { + return null; } - hasNextCalled = false; - return line; - } - - public boolean hasNext() { - hasNextCalled = true; + boolean success = false; + BytesRef result; try { - line = in.readLine(); - } catch (IOException ex) { - throw new RuntimeException(ex); + String line; + if ((line = in.readLine()) != null) { + spare.copyChars(line); + result = spare; + } else { + done = true; + IOUtils.close(in); + result = null; + } + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(in); + } } - return (line != null) ? 
true : false; - } - - public void remove() { - throw new UnsupportedOperationException(); + return result; } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java b/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java index 7f2ea7a..694e0ca 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java @@ -17,12 +17,17 @@ package org.apache.lucene.search.spell; * limitations under the License. */ +import java.util.Comparator; import java.util.Iterator; +import org.apache.lucene.util.BytesRef; + /** * Marker interface to signal that elements coming from {@link Iterator} * come in ascending lexicographic order. */ public interface SortedIterator { + + public Comparator comparator(); } diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java b/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java index 1564a72..858804d 100755 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java @@ -46,6 +46,7 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.util.Version; @@ -510,20 +511,18 @@ public class SpellChecker implements java.io.Closeable { boolean isEmpty = termsEnums.isEmpty(); try { - Iterator iter = dict.getWordsIterator(); - BytesRef currentTerm = new BytesRef(); + BytesRefIterator iter = dict.getWordsIterator(); + BytesRef currentTerm; - terms: while (iter.hasNext()) { - String word = iter.next(); + terms: while ((currentTerm = iter.next()) != null) { + String word = currentTerm.utf8ToString(); int len = 
word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... } if (!isEmpty) { - // we have a non-empty index, check if the term exists - currentTerm.copyChars(word); for (TermsEnum te : termsEnums) { if (te.seekExact(currentTerm, false)) { continue terms; diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java b/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java index 6819ee8..4a6d431 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java @@ -17,16 +17,18 @@ package org.apache.lucene.search.spell; * limitations under the License. */ -import java.util.Iterator; +import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; -public interface TermFreqIterator extends Iterator { +public interface TermFreqIterator extends BytesRefIterator { public float freq(); public static class TermFreqIteratorWrapper implements TermFreqIterator { - private Iterator wrapped; + private BytesRefIterator wrapped; - public TermFreqIteratorWrapper(Iterator wrapped) { + public TermFreqIteratorWrapper(BytesRefIterator wrapped) { this.wrapped = wrapped; } @@ -34,17 +36,8 @@ public interface TermFreqIterator extends Iterator { return 1.0f; } - public boolean hasNext() { - return wrapped.hasNext(); + public BytesRef next() throws IOException { + return wrapped.next(); } - - public String next() { - return wrapped.next().toString(); - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java index 4578ac6..78a2d26 100644 --- 
a/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java @@ -17,65 +17,47 @@ package org.apache.lucene.search.suggest; * limitations under the License. */ -import java.util.ArrayList; -import java.util.List; +import java.io.IOException; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefList; /** * This wrapper buffers incoming elements. */ public class BufferingTermFreqIteratorWrapper implements TermFreqIterator { - /** Entry in the buffer. */ - public static final class Entry implements Comparable { - String word; - float freq; - - public Entry(String word, float freq) { - this.word = word; - this.freq = freq; + protected BytesRefList entries = new BytesRefList(); + protected int curPos = -1; + protected float[] freqs = new float[1]; + private final BytesRef spare = new BytesRef(); + public BufferingTermFreqIteratorWrapper(TermFreqIterator source) throws IOException { + BytesRef spare; + int freqIndex = 0; + while((spare = source.next()) != null) { + entries.append(spare); + if (freqIndex >= freqs.length) { + freqs = ArrayUtil.grow(freqs, freqs.length+1); + } + freqs[freqIndex++] = source.freq(); } - - public int compareTo(Entry o) { - return word.compareTo(o.word); - } - } - - protected ArrayList entries = new ArrayList(); - - protected int curPos; - protected Entry curEntry; - - public BufferingTermFreqIteratorWrapper(TermFreqIterator source) { - // read all source data into buffer - while (source.hasNext()) { - String w = source.next(); - Entry e = new Entry(w, source.freq()); - entries.add(e); - } - curPos = 0; + } public float freq() { - return curEntry.freq; - } - - public boolean hasNext() { - return curPos < entries.size(); + return freqs[curPos]; } - public String next() { - curEntry = 
entries.get(curPos); - curPos++; - return curEntry.word; + @Override + public BytesRef next() throws IOException { + if (++curPos < entries.size()) { + entries.get(spare, curPos); + return spare; + } + return null; } - public void remove() { - throw new UnsupportedOperationException("remove is not supported"); - } - - public List entries() { - return entries; - } + } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java index b9cd5f5..15e833f 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java @@ -22,6 +22,8 @@ import java.io.*; import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; /** @@ -36,7 +38,7 @@ public class FileDictionary implements Dictionary { private BufferedReader in; private String line; - private boolean hasNextCalled; + private boolean done = false; public FileDictionary(InputStream dictFile) { in = new BufferedReader(new InputStreamReader(dictFile)); @@ -50,45 +52,39 @@ public class FileDictionary implements Dictionary { } public TermFreqIterator getWordsIterator() { - return new fileIterator(); + return new FileIterator(); } - final class fileIterator implements TermFreqIterator { + final class FileIterator implements TermFreqIterator { private float curFreq; + private final BytesRef spare = new BytesRef(); - public String next() { - if (!hasNextCalled) { - hasNext(); - } - hasNextCalled = false; - return line; - } - + public float freq() { return curFreq; } - public boolean hasNext() { - hasNextCalled = true; - try { - line = in.readLine(); - if (line != null) { - String[] fields = line.split("\t"); - if (fields.length > 1) { - curFreq = Float.parseFloat(fields[1]); - line = 
fields[0]; - } else { - curFreq = 1; - } + @Override + public BytesRef next() throws IOException { + if (done) { + return null; + } + line = in.readLine(); + if (line != null) { + String[] fields = line.split("\t"); + if (fields.length > 1) { + curFreq = Float.parseFloat(fields[1]); + spare.copyChars(fields[0]); + } else { + spare.copyChars(line); + curFreq = 1; } - } catch (IOException ex) { - throw new RuntimeException(ex); + return spare; + } else { + done = true; + IOUtils.close(in); + return null; } - return (line != null) ? true : false; - } - - public void remove() { - throw new UnsupportedOperationException(); } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java index ab20c01..1ab0eac 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java @@ -19,11 +19,13 @@ package org.apache.lucene.search.suggest; import java.io.File; import java.io.IOException; -import java.util.Iterator; +import java.io.InputStream; +import java.io.OutputStream; import java.util.List; import org.apache.lucene.search.spell.Dictionary; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.PriorityQueue; public abstract class Lookup { @@ -77,7 +79,7 @@ public abstract class Lookup { * {@link UnsortedTermFreqIteratorWrapper} in such case. */ public void build(Dictionary dict) throws IOException { - Iterator it = dict.getWordsIterator(); + BytesRefIterator it = dict.getWordsIterator(); TermFreqIterator tfit; if (it instanceof TermFreqIterator) { tfit = (TermFreqIterator)it; @@ -90,23 +92,6 @@ public abstract class Lookup { public abstract void build(TermFreqIterator tfit) throws IOException; /** - * Persist the constructed lookup data to a directory. Optional operation. 
- * @param storeDir directory where data can be stored. - * @return true if successful, false if unsuccessful or not supported. - * @throws IOException when fatal IO error occurs. - */ - public abstract boolean store(File storeDir) throws IOException; - - /** - * Discard current lookup data and load it from a previously saved copy. - * Optional operation. - * @param storeDir directory where lookup data was stored. - * @return true if completed successfully, false if unsuccessful or not supported. - * @throws IOException when fatal IO error occurs. - */ - public abstract boolean load(File storeDir) throws IOException; - - /** * Look up a key and return possible completion for this key. * @param key lookup key. Depending on the implementation this may be * a prefix, misspelling, or even infix. @@ -114,6 +99,7 @@ public abstract class Lookup { * @param num maximum number of results to return * @return a list of possible completions, with their relative weight (e.g. popularity) */ + // TODO: this should be a BytesRef API? public abstract List lookup(String key, boolean onlyMorePopular, int num); /** @@ -123,6 +109,7 @@ public abstract class Lookup { * @return true if new key is added, false if it already exists or operation * is not supported. */ + // TODO: this should be a BytesRef API? public abstract boolean add(String key, Object value); /** @@ -130,5 +117,40 @@ public abstract class Lookup { * @param key lookup key * @return associated value */ - public abstract Object get(String key); + // TODO: this should be a BytesRef API? + public abstract Object get(String key); + + /** + * Persist the constructed lookup data to a directory. Optional operation. + * @param output {@link OutputStream} to write the data to. + * @return true if successful, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. 
+ */ + public abstract boolean store(OutputStream output) throws IOException; + + /** + * Discard current lookup data and load it from a previously saved copy. + * Optional operation. + * @param input the {@link InputStream} to load the lookup data. + * @return true if completed successfully, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. + */ + public abstract boolean load(InputStream input) throws IOException; + + /** + * Persist the constructed lookup data to a directory. Optional operation. + * @param storeDir directory where data can be stored. + * @return true if successful, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. + */ + public abstract boolean store(File storeDir) throws IOException; + + /** + * Discard current lookup data and load it from a previously saved copy. + * Optional operation. + * @param storeDir directory where lookup data was stored. + * @return true if completed successfully, false if unsuccessful or not supported. + * @throws IOException when fatal IO error occurs. + */ + public abstract boolean load(File storeDir) throws IOException; } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java index ddff06e..ffa4f9b 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java @@ -17,10 +17,12 @@ package org.apache.lucene.search.suggest; * limitations under the License. 
*/ -import java.util.Collections; +import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; /** * This wrapper buffers incoming elements and makes sure they are sorted in @@ -28,8 +30,35 @@ import org.apache.lucene.search.spell.TermFreqIterator; */ public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator { - public SortedTermFreqIteratorWrapper(TermFreqIterator source) { + private final int[] sortedOrds; + private int currentOrd = -1; + private final BytesRef spare = new BytesRef(); + private final Comparator comp; + + + public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comp) throws IOException { super(source); - Collections.sort(entries); + this.sortedOrds = entries.sort(comp); + this.comp = comp; + } + + @Override + public float freq() { + return freqs[currentOrd]; + } + + @Override + public BytesRef next() throws IOException { + if (++curPos < entries.size()) { + return entries.get(spare, (currentOrd = sortedOrds[curPos])); + } + return null; + } + + @Override + public Comparator comparator() { + return comp; } + + } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java index d7b5b6e..d7b1b60 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java @@ -17,9 +17,11 @@ package org.apache.lucene.search.suggest; * limitations under the License. 
*/ -import java.util.Collections; +import java.io.IOException; +import java.util.Random; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; /** * This wrapper buffers the incoming elements and makes sure they are in @@ -27,8 +29,34 @@ import org.apache.lucene.search.spell.TermFreqIterator; */ public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper { - public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) { + private final int[] ords; + private int currentOrd = -1; + private final BytesRef spare = new BytesRef(); + public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException { super(source); - Collections.shuffle(entries); + ords = new int[entries.size()]; + Random random = new Random(); + for (int i = 0; i < ords.length; i++) { + ords[i] = i; + } + for (int i = 0; i < ords.length; i++) { + int randomPosition = random.nextInt(ords.length); + int temp = ords[i]; + ords[i] = ords[randomPosition]; + ords[randomPosition] = temp; + } + } + + @Override + public float freq() { + return freqs[currentOrd]; + } + + @Override + public BytesRef next() throws IOException { + if (++curPos < entries.size()) { + return entries.get(spare, (currentOrd = ords[curPos])); + } + return null; } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java index c6db1a8..4de0d00 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java @@ -19,6 +19,8 @@ package org.apache.lucene.search.suggest.fst; import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.List; @@ -29,6 +31,8 @@ import 
org.apache.lucene.search.suggest.fst.Sort.SortInfo; import org.apache.lucene.search.suggest.tst.TSTLookup; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.*; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.NoOutputs; @@ -158,20 +162,17 @@ public class FSTCompletionLookup extends Lookup { // If negative floats are allowed some trickery needs to be done to find their byte order. boolean success = false; try { - BytesRef tmp1 = new BytesRef(); byte [] buffer = new byte [0]; ByteArrayDataOutput output = new ByteArrayDataOutput(buffer); - while (tfit.hasNext()) { - String key = tfit.next(); - UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1); - - if (tmp1.length + 4 >= buffer.length) { - buffer = ArrayUtil.grow(buffer, tmp1.length + 4); + BytesRef spare; + while ((spare = tfit.next()) != null) { + if (spare.length + 4 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 4); } output.reset(buffer); output.writeInt(FloatMagic.toSortable(tfit.freq())); - output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length); + output.writeBytes(spare.bytes, spare.offset, spare.length); writer.write(buffer, 0, output.getPosition()); } writer.close(); @@ -189,6 +190,7 @@ public class FSTCompletionLookup extends Lookup { int previousBucket = 0; float previousScore = 0; ByteArrayDataInput input = new ByteArrayDataInput(); + BytesRef tmp1 = new BytesRef(); BytesRef tmp2 = new BytesRef(); while (reader.read(tmp1)) { input.reset(tmp1.bytes); @@ -293,4 +295,30 @@ public class FSTCompletionLookup extends Lookup { normalCompletion.getFST().save(new File(storeDir, FILENAME)); return true; } + + @Override + public synchronized boolean store(OutputStream output) throws IOException { + + if (this.normalCompletion == null) + return false; + try { + 
normalCompletion.getFST().save(new OutputStreamDataOutput(output)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public synchronized boolean load(InputStream input) throws IOException { + try { + this.higherWeightsCompletion = new FSTCompletion(new FST( + new InputStreamDataInput(input), NoOutputs.getSingleton())); + this.normalCompletion = new FSTCompletion( + higherWeightsCompletion.getFST(), false, exactMatchFirst); + } finally { + IOUtils.close(input); + } + return true; + } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index f7154b7..882b133 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -19,6 +19,8 @@ package org.apache.lucene.search.suggest.fst; import java.io.File; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -27,11 +29,12 @@ import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST.Arc; @@ -109,16 +112,14 @@ public class WFSTCompletionLookup extends Lookup { try { byte [] buffer = new byte [0]; ByteArrayDataOutput output = new 
ByteArrayDataOutput(buffer); - while (iterator.hasNext()) { - String key = iterator.next(); - UnicodeUtil.UTF16toUTF8(key, 0, key.length(), scratch); - - if (scratch.length + 5 >= buffer.length) { - buffer = ArrayUtil.grow(buffer, scratch.length + 5); + BytesRef spare; + while ((spare = iterator.next()) != null) { + if (spare.length + 5 >= buffer.length) { + buffer = ArrayUtil.grow(buffer, spare.length + 5); } output.reset(buffer); - output.writeBytes(scratch.bytes, scratch.offset, scratch.length); + output.writeBytes(spare.bytes, spare.offset, spare.length); output.writeByte((byte)0); // separator: not used, just for sort order output.writeInt((int)encodeWeight(iterator.freq())); writer.write(buffer, 0, output.getPosition()); @@ -177,6 +178,26 @@ public class WFSTCompletionLookup extends Lookup { this.fst = FST.read(new File(storeDir, FILENAME), PositiveIntOutputs.getSingleton(true)); return true; } + + @Override + public boolean store(OutputStream output) throws IOException { + try { + fst.save(new OutputStreamDataOutput(output)); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + try { + this.fst = new FST(new InputStreamDataInput(input), PositiveIntOutputs.getSingleton(true)); + } finally { + IOUtils.close(input); + } + return true; + } @Override public List lookup(String key, boolean onlyMorePopular, int num) { diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java index 4880261..82c2883 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/jaspell/JaspellLookup.java @@ -23,6 +23,8 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import 
java.io.OutputStream; import java.util.ArrayList; import java.util.List; @@ -31,6 +33,10 @@ import org.apache.lucene.search.spell.TermFreqIterator; import org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.UnsortedTermFreqIteratorWrapper; import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie.TSTNode; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.UnicodeUtil; public class JaspellLookup extends Lookup { JaspellTernarySearchTrie trie = new JaspellTernarySearchTrie(); @@ -41,17 +47,22 @@ public class JaspellLookup extends Lookup { public void build(TermFreqIterator tfit) throws IOException { if (tfit instanceof SortedIterator) { // make sure it's unsorted + // WTF - this could result in yet another sorted iteration.... tfit = new UnsortedTermFreqIteratorWrapper(tfit); } trie = new JaspellTernarySearchTrie(); trie.setMatchAlmostDiff(editDistance); - while (tfit.hasNext()) { - String key = tfit.next(); + BytesRef spare; + final CharsRef charsSpare = new CharsRef(); + + while ((spare = tfit.next()) != null) { float freq = tfit.freq(); - if (key.length() == 0) { + if (spare.length == 0) { continue; } - trie.put(key, new Float(freq)); + charsSpare.grow(spare.length); + UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); + trie.put(charsSpare.toString(), new Float(freq)); } } @@ -114,15 +125,7 @@ public class JaspellLookup extends Lookup { if (!data.exists() || !data.canRead()) { return false; } - DataInputStream in = new DataInputStream(new FileInputStream(data)); - TSTNode root = trie.new TSTNode('\0', null); - try { - readRecursively(in, root); - trie.setRoot(root); - } finally { - in.close(); - } - return true; + return load(new FileInputStream(data)); } private void readRecursively(DataInputStream in, TSTNode node) throws IOException { @@ -153,19 +156,8 @@ public class JaspellLookup extends 
Lookup { if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { return false; } - TSTNode root = trie.getRoot(); - if (root == null) { // empty tree - return false; - } File data = new File(storeDir, FILENAME); - DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); - try { - writeRecursively(out, root); - out.flush(); - } finally { - out.close(); - } - return true; + return store(new FileOutputStream(data)); } private void writeRecursively(DataOutputStream out, TSTNode node) throws IOException { @@ -186,4 +178,33 @@ public class JaspellLookup extends Lookup { writeRecursively(out, node.relatives[TSTNode.EQKID]); writeRecursively(out, node.relatives[TSTNode.HIKID]); } + + @Override + public boolean store(OutputStream output) throws IOException { + TSTNode root = trie.getRoot(); + if (root == null) { // empty tree + return false; + } + DataOutputStream out = new DataOutputStream(output); + try { + writeRecursively(out, root); + out.flush(); + } finally { + IOUtils.close(out); + } + return true; + } + + @Override + public boolean load(InputStream input) throws IOException { + DataInputStream in = new DataInputStream(input); + TSTNode root = trie.new TSTNode('\0', null); + try { + readRecursively(in, root); + trie.setRoot(root); + } finally { + IOUtils.close(in); + } + return true; + } } diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java index 54d24aa..01fe8b6 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java +++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/tst/TSTLookup.java @@ -23,6 +23,8 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.util.ArrayList; import java.util.List; @@ -30,6 +32,10 @@ import 
org.apache.lucene.search.suggest.Lookup; import org.apache.lucene.search.suggest.SortedTermFreqIteratorWrapper; import org.apache.lucene.search.spell.SortedIterator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.UnicodeUtil; public class TSTLookup extends Lookup { TernaryTreeNode root = new TernaryTreeNode(); @@ -39,15 +45,19 @@ public class TSTLookup extends Lookup { public void build(TermFreqIterator tfit) throws IOException { root = new TernaryTreeNode(); // buffer first - if (!(tfit instanceof SortedIterator)) { - // make sure it's sorted - tfit = new SortedTermFreqIteratorWrapper(tfit); + if ((!(tfit instanceof SortedIterator)) || ((SortedIterator)tfit).comparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) { + // make sure it's sorted and the comparator uses UTF16 sort order + tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator()); } ArrayList tokens = new ArrayList(); ArrayList vals = new ArrayList(); - while (tfit.hasNext()) { - tokens.add(tfit.next()); + BytesRef spare; + CharsRef charsSpare = new CharsRef(); + while ((spare = tfit.next()) != null) { + charsSpare.grow(spare.length); + UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare); + tokens.add(charsSpare.toString()); vals.add(new Float(tfit.freq())); } autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root); @@ -113,14 +123,7 @@ public class TSTLookup extends Lookup { if (!data.exists() || !data.canRead()) { return false; } - DataInputStream in = new DataInputStream(new FileInputStream(data)); - root = new TernaryTreeNode(); - try { - readRecursively(in, root); - } finally { - in.close(); - } - return true; + return load(new FileInputStream(data)); } // pre-order traversal @@ -153,14 +156,7 @@ public class TSTLookup extends Lookup { return false; 
} File data = new File(storeDir, FILENAME); - DataOutputStream out = new DataOutputStream(new FileOutputStream(data)); - try { - writeRecursively(out, root); - out.flush(); - } finally { - out.close(); - } - return true; + return store(new FileOutputStream(data)); } // pre-order traversal @@ -188,4 +184,28 @@ public class TSTLookup extends Lookup { writeRecursively(out, node.hiKid); } } + + @Override + public synchronized boolean store(OutputStream output) throws IOException { + DataOutputStream out = new DataOutputStream(output); + try { + writeRecursively(out, root); + out.flush(); + } finally { + IOUtils.close(output); + } + return true; + } + + @Override + public synchronized boolean load(InputStream input) throws IOException { + DataInputStream in = new DataInputStream(input); + root = new TernaryTreeNode(); + try { + readRecursively(in, root); + } finally { + IOUtils.close(in); + } + return true; + } } diff --git a/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java b/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java index c18b2c5..4373ba3 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java +++ b/modules/suggest/src/test/org/apache/lucene/search/spell/TestLuceneDictionary.java @@ -18,15 +18,17 @@ package org.apache.lucene.search.spell; */ import java.io.IOException; -import java.util.Iterator; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; /** @@ -40,7 +42,8 @@ public class TestLuceneDictionary extends 
LuceneTestCase { private IndexReader indexReader = null; private LuceneDictionary ld; - private Iterator it; + private BytesRefIterator it; + private BytesRef spare = new BytesRef(); @Override public void setUp() throws Exception { @@ -84,13 +87,12 @@ public class TestLuceneDictionary extends LuceneTestCase { public void testFieldNonExistent() throws IOException { try { - indexReader = IndexReader.open(store); + indexReader = DirectoryReader.open(store); ld = new LuceneDictionary(indexReader, "nonexistent_field"); it = ld.getWordsIterator(); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNull("More elements than expected", spare = it.next()); } finally { if (indexReader != null) { indexReader.close(); } } @@ -98,15 +100,13 @@ public class TestLuceneDictionary extends LuceneTestCase { public void testFieldAaa() throws IOException { try { - indexReader = IndexReader.open(store); + indexReader = DirectoryReader.open(store); ld = new LuceneDictionary(indexReader, "aaa"); it = ld.getWordsIterator(); - - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("foo")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNotNull("First element doesn't exist.", spare = it.next()); + assertTrue("First element isn't correct", spare.utf8ToString().equals("foo")); + assertNull("More elements than expected", it.next()); } finally { if (indexReader != null) { indexReader.close(); } } @@ -114,24 +114,22 @@ public class TestLuceneDictionary extends LuceneTestCase { public void testFieldContents_1() throws IOException { try { - indexReader = IndexReader.open(store); + indexReader = DirectoryReader.open(store); ld = new LuceneDictionary(indexReader, "contents"); it = ld.getWordsIterator(); - assertTrue("First element doesn't exist.", 
it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("Jerry")); - assertTrue("Second element doesn't exist.", it.hasNext()); - assertTrue("Second element isn't correct", it.next().equals("Tom")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNotNull("First element doesn't exist.", spare = it.next()); + assertTrue("First element isn't correct", spare.utf8ToString().equals("Jerry")); + assertNotNull("Second element doesn't exist.", spare = it.next()); + assertTrue("Second element isn't correct", spare.utf8ToString().equals("Tom")); + assertNull("More elements than expected", it.next()); ld = new LuceneDictionary(indexReader, "contents"); it = ld.getWordsIterator(); int counter = 2; - while (it.hasNext()) { - it.next(); + while (it.next() != null) { counter--; } @@ -144,30 +142,15 @@ public class TestLuceneDictionary extends LuceneTestCase { public void testFieldContents_2() throws IOException { try { - indexReader = IndexReader.open(store); + indexReader = DirectoryReader.open(store); ld = new LuceneDictionary(indexReader, "contents"); it = ld.getWordsIterator(); - // hasNext() should have no side effects - assertTrue("First element isn't were it should be.", it.hasNext()); - assertTrue("First element isn't were it should be.", it.hasNext()); - assertTrue("First element isn't were it should be.", it.hasNext()); - // just iterate through words - assertTrue("First element isn't correct", it.next().equals("Jerry")); - assertTrue("Second element isn't correct", it.next().equals("Tom")); - assertTrue("Nonexistent element is really null", it.next() == null); - - // hasNext() should still have no side effects ... - assertFalse("There should be any more elements", it.hasNext()); - assertFalse("There should be any more elements", it.hasNext()); - assertFalse("There should be any more elements", it.hasNext()); - - // .. 
and there are really no more words - assertTrue("Nonexistent element is really null", it.next() == null); - assertTrue("Nonexistent element is really null", it.next() == null); - assertTrue("Nonexistent element is really null", it.next() == null); + assertEquals("First element isn't correct", "Jerry", it.next().utf8ToString()); + assertEquals("Second element isn't correct", "Tom", it.next().utf8ToString()); + assertNull("Nonexistent element is really null", it.next()); } finally { if (indexReader != null) { indexReader.close(); } @@ -176,15 +159,14 @@ public class TestLuceneDictionary extends LuceneTestCase { public void testFieldZzz() throws IOException { try { - indexReader = IndexReader.open(store); + indexReader = DirectoryReader.open(store); ld = new LuceneDictionary(indexReader, "zzz"); it = ld.getWordsIterator(); - assertTrue("First element doesn't exist.", it.hasNext()); - assertTrue("First element isn't correct", it.next().equals("bar")); - assertFalse("More elements than expected", it.hasNext()); - assertTrue("Nonexistent element is really null", it.next() == null); + assertNotNull("First element doesn't exist.", spare = it.next()); + assertEquals("First element isn't correct", "bar", spare.utf8ToString()); + assertNull("More elements than expected", it.next()); } finally { if (indexReader != null) { indexReader.close(); } @@ -194,7 +176,7 @@ public class TestLuceneDictionary extends LuceneTestCase { public void testSpellchecker() throws IOException { Directory dir = newDirectory(); SpellChecker sc = new SpellChecker(dir); - indexReader = IndexReader.open(store); + indexReader = DirectoryReader.open(store); sc.indexDictionary(new LuceneDictionary(indexReader, "contents"), newIndexWriterConfig(TEST_VERSION_CURRENT, null), false); String[] suggestions = sc.suggestSimilar("Tam", 1); assertEquals(1, suggestions.length); diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java 
b/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java index 5ce7243..0d56526 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/LookupBenchmarkTest.java @@ -191,7 +191,7 @@ public class LookupBenchmarkTest extends LuceneTestCase { final List input = new ArrayList(benchmarkInput.size()); for (TermFreq tf : benchmarkInput) { - input.add(tf.term.substring(0, Math.min(tf.term.length(), + input.add(tf.term.utf8ToString().substring(0, Math.min(tf.term.length, minPrefixLen + random.nextInt(maxPrefixLen - minPrefixLen + 1)))); } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java index d476c58..c4ab93e 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/PersistenceTest.java @@ -75,11 +75,11 @@ public class PersistenceTest extends LuceneTestCase { // Assert validity. 
float previous = Float.NEGATIVE_INFINITY; for (TermFreq k : keys) { - Float val = (Float) lookup.get(k.term); - assertNotNull(k.term, val); + Float val = (Float) lookup.get(k.term.utf8ToString()); + assertNotNull(k.term.utf8ToString(), val); if (supportsExactWeights) { - assertEquals(k.term, Float.valueOf(k.v), val); + assertEquals(k.term.utf8ToString(), Float.valueOf(k.v), val); } else { assertTrue(val + ">=" + previous, val >= previous); previous = val.floatValue(); diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java index 36396cc..29d0433 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreq.java @@ -1,5 +1,7 @@ package org.apache.lucene.search.suggest; +import org.apache.lucene.util.BytesRef; + /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -18,10 +20,14 @@ package org.apache.lucene.search.suggest; */ public final class TermFreq { - public final String term; + public final BytesRef term; public final float v; public TermFreq(String term, float v) { + this(new BytesRef(term), v); + } + + public TermFreq(BytesRef term, float v) { this.term = term; this.v = v; } diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java index 77844c7..27cbb66 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/TermFreqArrayIterator.java @@ -17,10 +17,12 @@ package org.apache.lucene.search.suggest; * limitations under the License. 
*/ +import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import org.apache.lucene.search.spell.TermFreqIterator; +import org.apache.lucene.util.BytesRef; /** * A {@link TermFreqIterator} over a sequence of {@link TermFreq}s. @@ -28,6 +30,7 @@ import org.apache.lucene.search.spell.TermFreqIterator; public final class TermFreqArrayIterator implements TermFreqIterator { private final Iterator i; private TermFreq current; + private final BytesRef spare = new BytesRef(); public TermFreqArrayIterator(Iterator i) { this.i = i; @@ -44,14 +47,14 @@ public final class TermFreqArrayIterator implements TermFreqIterator { public float freq() { return current.v; } - - public boolean hasNext() { - return i.hasNext(); - } - - public String next() { - return (current = i.next()).term; - } - public void remove() { throw new UnsupportedOperationException(); } + @Override + public BytesRef next() throws IOException { + if (i.hasNext()) { + current = i.next(); + spare.copyBytes(current.term); + return spare; + } + return null; + } } \ No newline at end of file diff --git a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java index 8904e00..354b843 100644 --- a/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java +++ b/modules/suggest/src/test/org/apache/lucene/search/suggest/fst/FSTCompletionTest.java @@ -40,7 +40,7 @@ public class FSTCompletionTest extends LuceneTestCase { FSTCompletionBuilder builder = new FSTCompletionBuilder(); for (TermFreq tf : evalKeys()) { - builder.add(new BytesRef(tf.term), (int) tf.v); + builder.add(tf.term, (int) tf.v); } completion = builder.build(); completionAlphabetical = new FSTCompletion(completion.getFST(), false, true); @@ -167,7 +167,7 @@ public class FSTCompletionTest extends LuceneTestCase { // are. 
Float previous = null; for (TermFreq tf : keys) { - Float current = lookup.get(tf.term); + Float current = lookup.get(tf.term.utf8ToString()); if (previous != null) { assertEquals(previous, current); } @@ -183,8 +183,8 @@ public class FSTCompletionTest extends LuceneTestCase { lookup.build(new TermFreqArrayIterator(input)); for (TermFreq tf : input) { - assertTrue("Not found: " + tf.term, lookup.get(tf.term) != null); - assertEquals(tf.term, lookup.lookup(tf.term, true, 1).get(0).key); + assertTrue("Not found: " + tf.term, lookup.get(tf.term.utf8ToString()) != null); + assertEquals(tf.term, lookup.lookup(tf.term.utf8ToString(), true, 1).get(0).key); } List result = lookup.lookup("wit", true, 5); @@ -211,7 +211,7 @@ public class FSTCompletionTest extends LuceneTestCase { lookup.build(new TermFreqArrayIterator(freqs.toArray(new TermFreq[freqs.size()]))); for (TermFreq tf : freqs) { - final String term = tf.term; + final String term = tf.term.utf8ToString(); for (int i = 1; i < term.length(); i++) { String prefix = term.substring(0, i); for (LookupResult lr : lookup.lookup(prefix, true, 10)) {