diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java
index d154200..9957ec1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TermsEnum.java
@@ -23,6 +23,7 @@ import java.util.Comparator;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
/** Iterator to seek ({@link #seekCeil(BytesRef)}, {@link
* #seekExact(BytesRef,boolean)}) or step through ({@link
@@ -40,7 +41,7 @@ import org.apache.lucene.util.BytesRef;
* of the seek methods.
*
* @lucene.experimental */
-public abstract class TermsEnum {
+public abstract class TermsEnum implements BytesRefIterator {
private AttributeSource atts = null;
@@ -114,14 +115,6 @@ public abstract class TermsEnum {
}
}
- /** Increments the enumeration to the next term.
- * Returns the resulting term, or null if the end was
- * hit (which means the enum is unpositioned). The
- * returned BytesRef may be re-used across calls to next.
- * After this method returns null, do not call it again:
- * the results are undefined. */
- public abstract BytesRef next() throws IOException;
-
/** Returns current term. Do not call this when the enum
* is unpositioned. */
public abstract BytesRef term() throws IOException;
diff --git a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
index 1d8ebd2..cb56a51 100644
--- a/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
+++ b/lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
@@ -281,6 +281,37 @@ public final class ByteBlockPool {
}
/**
+ * Copies bytes.length bytes, starting at the pool-global offset bytes.offset, into bytes.bytes (offset reset to 0) and returns bytes.
+ */
+ public final BytesRef copyFrom(final BytesRef bytes) {
+ final int length = bytes.length; // number of bytes requested
+ final int offset = bytes.offset; // on entry: global offset into this pool
+ bytes.offset = 0;
+ bytes.grow(length); // ensure capacity for the copy
+ int bufferIndex = offset >> BYTE_BLOCK_SHIFT;
+ byte[] buffer = buffers[bufferIndex];
+ int pos = offset & BYTE_BLOCK_MASK;
+ int overflow = (pos + length) - BYTE_BLOCK_SIZE; // bytes that lie beyond the current block
+ do {
+ if (overflow <= 0) {
+ System.arraycopy(buffer, pos, bytes.bytes, bytes.offset, bytes.length); // remainder fits into this block
+ bytes.length = length;
+ bytes.offset = 0;
+ break;
+ } else {
+ final int bytesToCopy = BYTE_BLOCK_SIZE - pos; // everything left in the current block
+ System.arraycopy(buffer, pos, bytes.bytes, bytes.offset, bytesToCopy);
+ pos = 0;
+ bytes.length -= bytesToCopy;
+ bytes.offset += bytesToCopy;
+ buffer = buffers[++bufferIndex]; // continue with the following block
+ overflow = overflow - BYTE_BLOCK_SIZE;
+ }
+ } while (true);
+ return bytes;
+ }
+
+ /**
* Writes the pools content to the given {@link DataOutput}
*/
public final void writePool(final DataOutput out) throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java
new file mode 100644
index 0000000..5809bb5
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefIterator.java
@@ -0,0 +1,52 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/**
+ * A simple iterator interface for {@link BytesRef} iteration
+ * Exhaustion is signalled by {@link #next()} returning null.
+ */
+public interface BytesRefIterator {
+ /** Shared instance of {@link EmptyBytesRefIterator}; its {@link #next()} always returns null. */
+ public static final BytesRefIterator EMPTY_ITERATOR = new EmptyBytesRefIterator();
+
+ /**
+ * Increments the iteration to the next {@link BytesRef} in the iterator.
+ * Returns the resulting {@link BytesRef} or null if the end of
+ * the iterator is reached. The returned BytesRef may be re-used across calls
+ * to next. After this method returns null, do not call it again: the results
+ * are undefined.
+ *
+ * @return the next {@link BytesRef} in the iterator or null if
+ * the end of the iterator is reached.
+ * @throws IOException if an implementation encounters an I/O problem while advancing
+ */
+ public BytesRef next() throws IOException;
+ /** Iterator without elements: {@link #next()} returns null on every call. */
+ public final static class EmptyBytesRefIterator implements BytesRefIterator {
+
+ @Override
+ public BytesRef next() throws IOException {
+ return null;
+ }
+
+ }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/BytesRefList.java b/lucene/core/src/java/org/apache/lucene/util/BytesRefList.java
new file mode 100644
index 0000000..2630a7c
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefList.java
@@ -0,0 +1,119 @@
+package org.apache.lucene.util;
+
+import java.io.IOException;
+import java.util.Comparator;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+public class BytesRefList { // append-only list of BytesRef values whose bytes are kept in a ByteBlockPool
+
+ private final ByteBlockPool pool;
+ private int[] offsets = new int[1]; // offsets[i]: value of currentOffset when element i was appended
+ private int currentElement = 0; // number of elements appended so far
+ private int currentOffset = 0; // sum of the lengths of all appended elements
+ /** Creates a list on top of a new pool that uses a {@link ByteBlockPool.DirectAllocator}. */
+ public BytesRefList() {
+ this(new ByteBlockPool(new ByteBlockPool.DirectAllocator()));
+ }
+ /** Creates a list on top of the given pool and calls {@link ByteBlockPool#nextBuffer()} on it. */
+ public BytesRefList(ByteBlockPool pool) {
+ this.pool = pool;
+ pool.nextBuffer();
+ }
+ /** Copies the given bytes into the pool, records their start offset and returns the element count after the append. */
+ public int append(BytesRef bytes) {
+ if (currentElement >= offsets.length) {
+ offsets = ArrayUtil.grow(offsets, offsets.length + 1);
+ }
+ pool.copy(bytes);
+ offsets[currentElement++] = currentOffset;
+ currentOffset += bytes.length;
+ return currentElement;
+ }
+ /** Returns the number of elements appended so far. */
+ public int size() {
+ return currentElement;
+ }
+ /** Fills <code>bytes</code> with element <code>pos</code> and returns it; throws IndexOutOfBoundsException if pos is not less than size(). */
+ public BytesRef get(BytesRef bytes, int pos) {
+ if (currentElement > pos) {
+ bytes.offset = offsets[pos];
+ bytes.length = pos == currentElement - 1 ? currentOffset - bytes.offset // last element ends at the running offset
+ : offsets[pos + 1] - bytes.offset; // otherwise it ends where the next element starts
+ pool.copyFrom(bytes); // turns the (offset, length) pool coordinates into the actual bytes
+ return bytes;
+ }
+ throw new IndexOutOfBoundsException("index " + pos
+ + " must be less than the size: " + currentElement);
+
+ }
+ /** Returns an iterator over the elements present at call time; the returned BytesRef instance is re-used across calls. */
+ public BytesRefIterator iterator() {
+ final int numElements = currentElement; // snapshot: elements appended later are not visited
+
+ return new BytesRefIterator() {
+ private final BytesRef spare = new BytesRef();
+ private int pos = 0;
+
+ @Override
+ public BytesRef next() throws IOException {
+ if (pos < numElements) {
+ get(spare, pos++);
+ return spare;
+ }
+ return null;
+ }
+ };
+ }
+ /** Returns the element indices ordered according to <code>comp</code>; the stored elements are not moved. */
+ public int[] sort(final Comparator comp) {
+ final int[] orderdEntries = new int[size()];
+ for (int i = 0; i < orderdEntries.length; i++) {
+ orderdEntries[i] = i; // start from insertion order
+ }
+ new SorterTemplate() {
+ @Override
+ protected void swap(int i, int j) {
+ final int o = orderdEntries[i];
+ orderdEntries[i] = orderdEntries[j];
+ orderdEntries[j] = o;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ final int ord1 = orderdEntries[i], ord2 = orderdEntries[j];
+ return comp.compare(get(scratch1, ord1), get(scratch2, ord2));
+ }
+
+ @Override
+ protected void setPivot(int i) {
+ final int ord = orderdEntries[i];
+ get(pivot, ord);
+ }
+
+ @Override
+ protected int comparePivot(int j) {
+ final int ord = orderdEntries[j];
+ return comp.compare(pivot, get(scratch2, ord));
+ }
+
+ private final BytesRef pivot = new BytesRef(),
+ scratch1 = new BytesRef(), scratch2 = new BytesRef();
+ }.quickSort(0, size() - 1);
+ return orderdEntries;
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestBytesRefList.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefList.java
new file mode 100644
index 0000000..1cc737e
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefList.java
@@ -0,0 +1,81 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership. The ASF
+ * licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+public class TestBytesRefList extends LuceneTestCase {
+ /** Appends random strings and checks sequential get, random get and two independent iterations. */
+ public void testAppend() throws IOException {
+ BytesRefList list = new BytesRefList();
+ List<String> stringList = new ArrayList<String>();
+ int entries = atLeast(500);
+ BytesRef spare = new BytesRef();
+ for (int i = 0; i < entries; i++) {
+ String randomRealisticUnicodeString = _TestUtil
+ .randomRealisticUnicodeString(random);
+ spare.copyChars(randomRealisticUnicodeString);
+ list.append(spare);
+ stringList.add(randomRealisticUnicodeString);
+ }
+ for (int i = 0; i < entries; i++) {
+ assertNotNull(list.get(spare, i));
+ assertEquals("entry " + i + " doesn't match", stringList.get(i),
+ spare.utf8ToString());
+ }
+
+ // check random
+ for (int i = 0; i < entries; i++) {
+ int e = random.nextInt(entries);
+ assertNotNull(list.get(spare, e));
+ assertEquals("entry " + e + " doesn't match", stringList.get(e),
+ spare.utf8ToString());
+ }
+ for (int i = 0; i < 2; i++) {
+ BytesRefIterator iterator = list.iterator();
+ for (String string : stringList) {
+ assertEquals(string, iterator.next().utf8ToString());
+ }
+ assertNull(iterator.next()); // an exhausted iterator must report the end
+ }
+ }
+ /** Checks that sort() yields the same order as sorting the Java strings. */
+ public void testSort() {
+ BytesRefList list = new BytesRefList();
+ List<String> stringList = new ArrayList<String>();
+ int entries = atLeast(500);
+ BytesRef spare = new BytesRef();
+ for (int i = 0; i < entries; i++) {
+ String randomRealisticUnicodeString = _TestUtil.randomRealisticUnicodeString(random);
+ spare.copyChars(randomRealisticUnicodeString);
+ list.append(spare);
+ stringList.add(randomRealisticUnicodeString);
+ }
+ Collections.sort(stringList);
+ int[] sortedOrds = list.sort(BytesRef.getUTF8SortedAsUTF16Comparator());
+ for (int i = 0; i < entries; i++) {
+ assertNotNull(list.get(spare, sortedOrds[i]));
+ assertEquals("entry " + i + " doesn't match", stringList.get(i),
+ spare.utf8ToString());
+ }
+
+ }
+}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java
index 4dee714..073da44 100755
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/Dictionary.java
@@ -16,7 +16,7 @@ package org.apache.lucene.search.spell;
* limitations under the License.
*/
-import java.util.Iterator;
+import org.apache.lucene.util.BytesRefIterator;
/**
* A simple interface representing a Dictionary. A Dictionary
@@ -30,5 +30,5 @@ public interface Dictionary {
* Return all words present in the dictionary
* @return Iterator
*/
- Iterator getWordsIterator();
+ BytesRefIterator getWordsIterator();
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java
index c867253..bec1c31 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/HighFrequencyDictionary.java
@@ -18,12 +18,14 @@
package org.apache.lucene.search.spell;
import java.io.IOException;
+import java.util.Comparator;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
@@ -50,14 +52,13 @@ public class HighFrequencyDictionary implements Dictionary {
this.thresh = thresh;
}
- public final Iterator getWordsIterator() {
+ public final BytesRefIterator getWordsIterator() {
return new HighFrequencyIterator();
}
final class HighFrequencyIterator implements TermFreqIterator, SortedIterator {
- private TermsEnum termsEnum;
- private BytesRef actualTerm;
- private boolean hasNextCalled;
+ private final BytesRef spare = new BytesRef();
+ private final TermsEnum termsEnum;
private int minNumDocs;
HighFrequencyIterator() {
@@ -65,6 +66,8 @@ public class HighFrequencyDictionary implements Dictionary {
Terms terms = MultiFields.getTerms(reader, field);
if (terms != null) {
termsEnum = terms.iterator(null);
+ } else {
+ termsEnum = null;
}
minNumDocs = (int)(thresh * (float)reader.numDocs());
} catch (IOException e) {
@@ -83,57 +86,27 @@ public class HighFrequencyDictionary implements Dictionary {
throw new RuntimeException(ioe);
}
}
-
- public String next() {
- if (!hasNextCalled && !hasNext()) {
- return null;
- }
- hasNextCalled = false;
-
- if (actualTerm == null) {
- return null;
- } else {
- UnicodeUtil.UTF8toUTF16(actualTerm, spare);
- return spare.toString();
- }
- }
-
- public boolean hasNext() {
- if (hasNextCalled) {
- return actualTerm != null;
- }
- hasNextCalled = true;
-
- if (termsEnum == null) {
- return false;
- }
- while(true) {
- try {
- actualTerm = termsEnum.next();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- // if there are no words return false
- if (actualTerm == null) {
- return false;
- }
-
- // got a valid term, does it pass the threshold?
- try {
- if (isFrequent(termsEnum.docFreq())) {
- return true;
- }
- } catch (IOException ioe) {
- throw new RuntimeException(ioe);
+ @Override
+ public BytesRef next() throws IOException { // returns the next term that passes isFrequent, or null at the end
+ BytesRef next;
+ while (termsEnum != null && (next = termsEnum.next()) != null) { // skip infrequent terms instead of ending the iteration
+ if (isFrequent(termsEnum.docFreq())) {
+ spare.copyBytes(next); // copy: the enum may re-use its BytesRef
+ return spare;
}
}
+ return null;
}
- public void remove() {
- throw new UnsupportedOperationException();
+ @Override
+ public Comparator comparator() { // order of the underlying TermsEnum, or null if the field has no terms
+ try {
+ return termsEnum == null ? null : termsEnum.getComparator(); // termsEnum is null when the field has no terms
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
}
}
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
index 894dc0c..bd4afcc 100755
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
@@ -18,13 +18,7 @@ package org.apache.lucene.search.spell;
*/
import org.apache.lucene.index.IndexReader;
-
-import java.util.Iterator;
-
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.MultiFields;
@@ -49,50 +43,18 @@ public class LuceneDictionary implements Dictionary {
this.field = field;
}
- public final Iterator getWordsIterator() {
- return new LuceneIterator();
- }
-
-
- final class LuceneIterator implements Iterator {
- private TermsEnum termsEnum;
- private BytesRef pendingTerm;
- private final CharsRef spare = new CharsRef();
-
- LuceneIterator() {
- try {
- final Terms terms = MultiFields.getTerms(reader, field);
- if (terms != null) {
- termsEnum = terms.iterator(null);
- pendingTerm = termsEnum.next();
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- public String next() {
- if (pendingTerm == null) {
- return null;
+ public final BytesRefIterator getWordsIterator() {
+ // Returns the terms of the configured field; an IOException is rethrown as RuntimeException.
+ try {
+ final Terms terms = MultiFields.getTerms(reader, field);
+ if (terms != null) {
+ return terms.iterator(null); // the TermsEnum itself serves as the BytesRefIterator
+ } else {
+ return BytesRefIterator.EMPTY_ITERATOR; // no terms for this field: nothing to iterate
}
-
- UnicodeUtil.UTF8toUTF16(pendingTerm, spare);
-
- try {
- pendingTerm = termsEnum.next();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- return spare.toString();
- }
-
- public boolean hasNext() {
- return pendingTerm != null;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
}
}
+
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
index 2eaac46..39b1b0e 100755
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
@@ -21,6 +21,10 @@ package org.apache.lucene.search.spell;
import java.util.Iterator;
import java.io.*;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
+import org.apache.lucene.util.IOUtils;
+
/**
* Dictionary represented by a text file.
@@ -33,8 +37,6 @@ import java.io.*;
public class PlainTextDictionary implements Dictionary {
private BufferedReader in;
- private String line;
- private boolean hasNextCalled;
public PlainTextDictionary(File file) throws FileNotFoundException {
in = new BufferedReader(new FileReader(file));
@@ -51,31 +53,37 @@ public class PlainTextDictionary implements Dictionary {
in = new BufferedReader(reader);
}
- public Iterator getWordsIterator() {
- return new fileIterator();
+ public BytesRefIterator getWordsIterator() {
+ return new FileIterator();
}
- final class fileIterator implements Iterator {
- public String next() {
- if (!hasNextCalled) {
- hasNext();
+ final class FileIterator implements BytesRefIterator { // iterates the lines of the enclosing dictionary's reader
+ private boolean done = false; // set once readLine() has returned null
+ private final BytesRef spare = new BytesRef(); // re-used for every returned line
+ @Override
+ public BytesRef next() throws IOException {
+ if (done) {
+ return null;
}
- hasNextCalled = false;
- return line;
- }
-
- public boolean hasNext() {
- hasNextCalled = true;
+ boolean success = false; // stays false if anything inside the try block throws
+ BytesRef result;
try {
- line = in.readLine();
- } catch (IOException ex) {
- throw new RuntimeException(ex);
+ String line;
+ if ((line = in.readLine()) != null) {
+ spare.copyChars(line);
+ result = spare;
+ } else {
+ done = true;
+ IOUtils.close(in); // end of input: close the reader
+ result = null;
+ }
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(in); // failure path: close the reader as well
+ }
}
- return (line != null) ? true : false;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
+ return result;
}
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java b/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java
index 7f2ea7a..694e0ca 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/SortedIterator.java
@@ -17,12 +17,17 @@ package org.apache.lucene.search.spell;
* limitations under the License.
*/
+import java.util.Comparator;
import java.util.Iterator;
+import org.apache.lucene.util.BytesRef;
+
/**
- * Marker interface to signal that elements coming from {@link Iterator}
- * come in ascending lexicographic order.
+ * Interface to signal that elements coming from an iterator
+ * come in ascending order as defined by {@link #comparator()}.
*/
public interface SortedIterator {
+ /** Returns the comparator that defines the order in which the elements are returned. */
+ public Comparator comparator();
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java b/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
index 1564a72..858804d 100755
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
@@ -46,6 +46,7 @@ import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.Version;
@@ -510,20 +511,18 @@ public class SpellChecker implements java.io.Closeable {
boolean isEmpty = termsEnums.isEmpty();
try {
- Iterator iter = dict.getWordsIterator();
- BytesRef currentTerm = new BytesRef();
+ BytesRefIterator iter = dict.getWordsIterator();
+ BytesRef currentTerm;
- terms: while (iter.hasNext()) {
- String word = iter.next();
+ terms: while ((currentTerm = iter.next()) != null) {
+ String word = currentTerm.utf8ToString();
int len = word.length();
if (len < 3) {
continue; // too short we bail but "too long" is fine...
}
if (!isEmpty) {
- // we have a non-empty index, check if the term exists
- currentTerm.copyChars(word);
for (TermsEnum te : termsEnums) {
if (te.seekExact(currentTerm, false)) {
continue terms;
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java b/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java
index 6819ee8..4a6d431 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/TermFreqIterator.java
@@ -17,16 +17,18 @@ package org.apache.lucene.search.spell;
* limitations under the License.
*/
-import java.util.Iterator;
+import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefIterator;
-public interface TermFreqIterator extends Iterator {
+public interface TermFreqIterator extends BytesRefIterator {
public float freq();
public static class TermFreqIteratorWrapper implements TermFreqIterator {
- private Iterator wrapped;
+ private BytesRefIterator wrapped;
- public TermFreqIteratorWrapper(Iterator wrapped) {
+ public TermFreqIteratorWrapper(BytesRefIterator wrapped) {
this.wrapped = wrapped;
}
@@ -34,17 +36,8 @@ public interface TermFreqIterator extends Iterator {
return 1.0f;
}
- public boolean hasNext() {
- return wrapped.hasNext();
+ public BytesRef next() throws IOException {
+ return wrapped.next();
}
-
- public String next() {
- return wrapped.next().toString();
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
}
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java
index 4578ac6..78a2d26 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/BufferingTermFreqIteratorWrapper.java
@@ -17,65 +17,47 @@ package org.apache.lucene.search.suggest;
* limitations under the License.
*/
-import java.util.ArrayList;
-import java.util.List;
+import java.io.IOException;
import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefList;
/**
* This wrapper buffers incoming elements.
*/
public class BufferingTermFreqIteratorWrapper implements TermFreqIterator {
- /** Entry in the buffer. */
- public static final class Entry implements Comparable {
- String word;
- float freq;
-
- public Entry(String word, float freq) {
- this.word = word;
- this.freq = freq;
+ protected BytesRefList entries = new BytesRefList(); // buffered terms, in source order
+ protected int curPos = -1; // position of the entry returned by the last next() call; -1 before the first call
+ protected float[] freqs = new float[1]; // freqs[i] is the frequency read for entries element i
+ private final BytesRef spare = new BytesRef(); // re-used as the return value of next()
+ public BufferingTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
+ BytesRef term; // not named 'spare', so the field above is not shadowed
+ int freqIndex = 0;
+ while ((term = source.next()) != null) { // read all source data into the buffer
+ entries.append(term);
+ if (freqIndex >= freqs.length) {
+ freqs = ArrayUtil.grow(freqs, freqs.length+1);
+ }
+ freqs[freqIndex++] = source.freq(); // frequency belonging to the term just read
}
-
- public int compareTo(Entry o) {
- return word.compareTo(o.word);
- }
- }
-
- protected ArrayList entries = new ArrayList();
-
- protected int curPos;
- protected Entry curEntry;
-
- public BufferingTermFreqIteratorWrapper(TermFreqIterator source) {
- // read all source data into buffer
- while (source.hasNext()) {
- String w = source.next();
- Entry e = new Entry(w, source.freq());
- entries.add(e);
- }
- curPos = 0;
+
}
public float freq() {
- return curEntry.freq;
- }
-
- public boolean hasNext() {
- return curPos < entries.size();
+ return freqs[curPos]; // only meaningful after next() has returned a non-null term
}
- public String next() {
- curEntry = entries.get(curPos);
- curPos++;
- return curEntry.word;
+ @Override
+ public BytesRef next() throws IOException {
+ if (++curPos < entries.size()) {
+ entries.get(spare, curPos);
+ return spare;
+ }
+ return null; // all buffered entries have been returned
}
- public void remove() {
- throw new UnsupportedOperationException("remove is not supported");
- }
-
- public List entries() {
- return entries;
- }
+
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java
index b9cd5f5..15e833f 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/FileDictionary.java
@@ -22,6 +22,8 @@ import java.io.*;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
/**
@@ -36,7 +38,7 @@ public class FileDictionary implements Dictionary {
private BufferedReader in;
private String line;
- private boolean hasNextCalled;
+ private boolean done = false;
public FileDictionary(InputStream dictFile) {
in = new BufferedReader(new InputStreamReader(dictFile));
@@ -50,45 +52,39 @@ public class FileDictionary implements Dictionary {
}
public TermFreqIterator getWordsIterator() {
- return new fileIterator();
+ return new FileIterator();
}
- final class fileIterator implements TermFreqIterator {
+ final class FileIterator implements TermFreqIterator {
private float curFreq;
+ private final BytesRef spare = new BytesRef();
- public String next() {
- if (!hasNextCalled) {
- hasNext();
- }
- hasNextCalled = false;
- return line;
- }
-
+
public float freq() {
return curFreq;
}
- public boolean hasNext() {
- hasNextCalled = true;
- try {
- line = in.readLine();
- if (line != null) {
- String[] fields = line.split("\t");
- if (fields.length > 1) {
- curFreq = Float.parseFloat(fields[1]);
- line = fields[0];
- } else {
- curFreq = 1;
- }
+ @Override
+ public BytesRef next() throws IOException { // returns the next term (weight available via freq()), or null at end of input
+ if (done) {
+ return null;
+ }
+ line = in.readLine(); // 'line' and 'done' are fields of the enclosing FileDictionary
+ if (line != null) {
+ String[] fields = line.split("\t"); // term, optionally followed by a tab and its weight
+ if (fields.length > 1) {
+ curFreq = Float.parseFloat(fields[1]);
+ spare.copyChars(fields[0]);
+ } else {
+ spare.copyChars(line);
+ curFreq = 1; // no weight column: default weight 1
}
- } catch (IOException ex) {
- throw new RuntimeException(ex);
+ return spare;
+ } else {
+ done = true;
+ IOUtils.close(in); // NOTE(review): the reader is closed only at end of input; unlike PlainTextDictionary.FileIterator nothing closes it when readLine()/parseFloat throws - confirm this is intended
+ return null;
}
- return (line != null) ? true : false;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
}
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
index ab20c01..1ab0eac 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/Lookup.java
@@ -19,11 +19,13 @@ package org.apache.lucene.search.suggest;
import java.io.File;
import java.io.IOException;
-import java.util.Iterator;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.util.List;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.PriorityQueue;
public abstract class Lookup {
@@ -77,7 +79,7 @@ public abstract class Lookup {
* {@link UnsortedTermFreqIteratorWrapper} in such case.
*/
public void build(Dictionary dict) throws IOException {
- Iterator it = dict.getWordsIterator();
+ BytesRefIterator it = dict.getWordsIterator();
TermFreqIterator tfit;
if (it instanceof TermFreqIterator) {
tfit = (TermFreqIterator)it;
@@ -90,23 +92,6 @@ public abstract class Lookup {
public abstract void build(TermFreqIterator tfit) throws IOException;
/**
- * Persist the constructed lookup data to a directory. Optional operation.
- * @param storeDir directory where data can be stored.
- * @return true if successful, false if unsuccessful or not supported.
- * @throws IOException when fatal IO error occurs.
- */
- public abstract boolean store(File storeDir) throws IOException;
-
- /**
- * Discard current lookup data and load it from a previously saved copy.
- * Optional operation.
- * @param storeDir directory where lookup data was stored.
- * @return true if completed successfully, false if unsuccessful or not supported.
- * @throws IOException when fatal IO error occurs.
- */
- public abstract boolean load(File storeDir) throws IOException;
-
- /**
* Look up a key and return possible completion for this key.
* @param key lookup key. Depending on the implementation this may be
* a prefix, misspelling, or even infix.
@@ -114,6 +99,7 @@ public abstract class Lookup {
* @param num maximum number of results to return
* @return a list of possible completions, with their relative weight (e.g. popularity)
*/
+ // TODO: this should be a BytesRef API?
public abstract List lookup(String key, boolean onlyMorePopular, int num);
/**
@@ -123,6 +109,7 @@ public abstract class Lookup {
* @return true if new key is added, false if it already exists or operation
* is not supported.
*/
+ // TODO: this should be a BytesRef API?
public abstract boolean add(String key, Object value);
/**
@@ -130,5 +117,40 @@ public abstract class Lookup {
* @param key lookup key
* @return associated value
*/
- public abstract Object get(String key);
+ // TODO: this should be a BytesRef API?
+ public abstract Object get(String key);
+
+ /**
+ * Persist the constructed lookup data to the given {@link OutputStream}. Optional operation.
+ * @param output {@link OutputStream} to write the data to.
+ * @return true if successful, false if unsuccessful or not supported.
+ * @throws IOException when fatal IO error occurs.
+ */
+ public abstract boolean store(OutputStream output) throws IOException;
+
+ /**
+ * Discard current lookup data and load it from a previously saved copy.
+ * Optional operation.
+ * @param input the {@link InputStream} to load the lookup data.
+ * @return true if completed successfully, false if unsuccessful or not supported.
+ * @throws IOException when fatal IO error occurs.
+ */
+ public abstract boolean load(InputStream input) throws IOException;
+
+ /**
+ * Persist the constructed lookup data to a directory. Optional operation.
+ * @param storeDir directory where data can be stored.
+ * @return true if successful, false if unsuccessful or not supported.
+ * @throws IOException when fatal IO error occurs.
+ */
+ public abstract boolean store(File storeDir) throws IOException;
+
+ /**
+ * Discard current lookup data and load it from a previously saved copy.
+ * Optional operation.
+ * @param storeDir directory where lookup data was stored.
+ * @return true if completed successfully, false if unsuccessful or not supported.
+ * @throws IOException when fatal IO error occurs.
+ */
+ public abstract boolean load(File storeDir) throws IOException;
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
index ddff06e..ffa4f9b 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/SortedTermFreqIteratorWrapper.java
@@ -17,10 +17,12 @@ package org.apache.lucene.search.suggest;
* limitations under the License.
*/
-import java.util.Collections;
+import java.io.IOException;
+import java.util.Comparator;
import org.apache.lucene.search.spell.SortedIterator;
import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.util.BytesRef;
/**
* This wrapper buffers incoming elements and makes sure they are sorted in
@@ -28,8 +30,35 @@ import org.apache.lucene.search.spell.TermFreqIterator;
*/
public class SortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper implements SortedIterator {
- public SortedTermFreqIteratorWrapper(TermFreqIterator source) {
+ private final int[] sortedOrds;
+ private int currentOrd = -1;
+ private final BytesRef spare = new BytesRef();
+ private final Comparator comp;
+
+
+ public SortedTermFreqIteratorWrapper(TermFreqIterator source, Comparator comp) throws IOException {
super(source);
- Collections.sort(entries);
+ this.comp = comp; // kept so comparator() can hand it back
+ this.sortedOrds = entries.sort(comp); // ordinals that next() walks through position by position
+ }
+
+ @Override
+ public float freq() {
+ return freqs[currentOrd]; // currentOrd is assigned by next(); it is still -1 before next() has returned a term
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ if (++curPos >= entries.size()) {
+ return null; // curPos has moved past the last buffered entry
+ }
+ return entries.get(spare, (currentOrd = sortedOrds[curPos])); // remember the ordinal for freq()
+ }
+
+ @Override
+ public Comparator comparator() {
+ return comp; // the comparator given to the constructor and passed to entries.sort
}
+
+
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java
index d7b5b6e..d7b1b60 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/UnsortedTermFreqIteratorWrapper.java
@@ -17,9 +17,11 @@ package org.apache.lucene.search.suggest;
* limitations under the License.
*/
-import java.util.Collections;
+import java.io.IOException;
+import java.util.Random;
import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.util.BytesRef;
/**
* This wrapper buffers the incoming elements and makes sure they are in
@@ -27,8 +29,34 @@ import org.apache.lucene.search.spell.TermFreqIterator;
*/
public class UnsortedTermFreqIteratorWrapper extends BufferingTermFreqIteratorWrapper {
- public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) {
+ private final int[] ords;
+ private int currentOrd = -1;
+ private final BytesRef spare = new BytesRef();
+ public UnsortedTermFreqIteratorWrapper(TermFreqIterator source) throws IOException {
super(source);
- Collections.shuffle(entries);
+ ords = new int[entries.size()]; // one ordinal per buffered entry
+ Random random = new Random();
+ for (int i = 0; i < ords.length; i++) {
+ ords[i] = i; // start from the identity order
+ }
+ for (int i = ords.length - 1; i > 0; i--) { // Fisher-Yates: every permutation is equally likely
+ int randomPosition = random.nextInt(i + 1); // draw only from the not-yet-fixed range [0, i]
+ int temp = ords[i];
+ ords[i] = ords[randomPosition];
+ ords[randomPosition] = temp;
+ }
+ }
+
+ @Override
+ public float freq() {
+ return freqs[currentOrd]; // currentOrd is assigned by next(); it is still -1 before next() has returned a term
+ }
+
+ @Override
+ public BytesRef next() throws IOException {
+ if (++curPos >= entries.size()) {
+ return null; // curPos has moved past the last buffered entry
+ }
+ return entries.get(spare, (currentOrd = ords[curPos])); // remember the ordinal for freq()
}
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
index c6db1a8..4de0d00 100644
--- a/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
+++ b/modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java
@@ -19,6 +19,8 @@ package org.apache.lucene.search.suggest.fst;
import java.io.File;
import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
@@ -29,6 +31,8 @@ import org.apache.lucene.search.suggest.fst.Sort.SortInfo;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.NoOutputs;
@@ -158,20 +162,17 @@ public class FSTCompletionLookup extends Lookup {
// If negative floats are allowed some trickery needs to be done to find their byte order.
boolean success = false;
try {
- BytesRef tmp1 = new BytesRef();
byte [] buffer = new byte [0];
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
- while (tfit.hasNext()) {
- String key = tfit.next();
- UnicodeUtil.UTF16toUTF8(key, 0, key.length(), tmp1);
-
- if (tmp1.length + 4 >= buffer.length) {
- buffer = ArrayUtil.grow(buffer, tmp1.length + 4);
+ BytesRef spare;
+ while ((spare = tfit.next()) != null) {
+ if (spare.length + 4 >= buffer.length) {
+ buffer = ArrayUtil.grow(buffer, spare.length + 4);
}
output.reset(buffer);
output.writeInt(FloatMagic.toSortable(tfit.freq()));
- output.writeBytes(tmp1.bytes, tmp1.offset, tmp1.length);
+ output.writeBytes(spare.bytes, spare.offset, spare.length);
writer.write(buffer, 0, output.getPosition());
}
writer.close();
@@ -189,6 +190,7 @@ public class FSTCompletionLookup extends Lookup {
int previousBucket = 0;
float previousScore = 0;
ByteArrayDataInput input = new ByteArrayDataInput();
+ BytesRef tmp1 = new BytesRef();
BytesRef tmp2 = new BytesRef();
while (reader.read(tmp1)) {
input.reset(tmp1.bytes);
@@ -293,4 +295,30 @@ public class FSTCompletionLookup extends Lookup {
normalCompletion.getFST().save(new File(storeDir, FILENAME));
return true;
}
+
+ @Override
+ public synchronized boolean store(OutputStream output) throws IOException {
+ if (normalCompletion == null) { // no completion FST to write; output is not closed on this path
+ return false;
+ }
+ try {
+ normalCompletion.getFST().save(new OutputStreamDataOutput(output));
+ return true;
+ } finally {
+ IOUtils.close(output); // runs on both the success and the failure path
+ }
+ }
+
+ @Override
+ public synchronized boolean load(InputStream input) throws IOException {
+ try {
+ this.higherWeightsCompletion = new FSTCompletion(new FST