Index: modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java =================================================================== --- modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (revision 1231386) +++ modules/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionBuilder.java (working copy) @@ -5,6 +5,7 @@ import java.util.Iterator; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.*; /** @@ -219,11 +220,12 @@ shareMaxTailLength, outputs, null); BytesRef scratch = new BytesRef(); + final IntsRef scratchIntsRef = new IntsRef(); int count = 0; for (Iterator i = sorter.iterator(); i.hasNext(); count++) { BytesRef entry = i.next(); if (scratch.compareTo(entry) != 0) { - builder.add(entry, empty); + builder.add(Util.toIntsRef(entry, scratchIntsRef), empty); scratch.copyBytes(entry); } } Index: modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (revision 1231386) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (working copy) @@ -33,9 +33,11 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; /** * A map of synonyms, keys and values are phrases. 
@@ -262,6 +264,8 @@ Set keys = workingSet.keySet(); CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); + + final IntsRef scratchIntsRef = new IntsRef(); //System.out.println("fmap.build"); for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { @@ -307,7 +311,7 @@ scratch.length = scratchOutput.getPosition() - scratch.offset; //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); - builder.add(input, BytesRef.deepCopyOf(scratch)); + builder.add(Util.toUTF32(input, scratchIntsRef), BytesRef.deepCopyOf(scratch)); } FST fst = builder.finish(); Index: lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java =================================================================== --- lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (revision 1231386) +++ lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java (working copy) @@ -1050,6 +1050,7 @@ } Terms terms = MultiFields.getTerms(r, "body"); if (terms != null) { + final IntsRef scratchIntsRef = new IntsRef(); final TermsEnum termsEnum = terms.iterator(null); if (VERBOSE) { System.out.println("TEST: got termsEnum=" + termsEnum); @@ -1073,7 +1074,7 @@ } else { output = termsEnum.docFreq(); } - builder.add(term, outputs.get(output)); + builder.add(Util.toIntsRef(term, scratchIntsRef), outputs.get(output)); ord++; if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { System.out.println(ord + " terms..."); @@ -1373,7 +1374,7 @@ public void testSingleString() throws Exception { final Outputs outputs = NoOutputs.getSingleton(); final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - b.add(new BytesRef("foobar"), outputs.getNoOutput()); + b.add(Util.toIntsRef(new BytesRef("foobar"), new IntsRef()), outputs.getNoOutput()); final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(b.finish()); assertNull(fstEnum.seekFloor(new 
BytesRef("foo"))); assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); @@ -1395,9 +1396,9 @@ final BytesRef b = new BytesRef("b"); final BytesRef c = new BytesRef("c"); - builder.add(a, outputs.get(17)); - builder.add(b, outputs.get(42)); - builder.add(c, outputs.get(13824324872317238L)); + builder.add(Util.toIntsRef(a, new IntsRef()), outputs.get(17)); + builder.add(Util.toIntsRef(b, new IntsRef()), outputs.get(42)); + builder.add(Util.toIntsRef(c, new IntsRef()), outputs.get(13824324872317238L)); final FST fst = builder.finish(); @@ -1628,13 +1629,14 @@ int line = 0; final BytesRef term = new BytesRef(); + final IntsRef scratchIntsRef = new IntsRef(); while (line < lines.length) { String w = lines[line++]; if (w == null) { break; } term.copyChars(w); - b.add(term, nothing); + b.add(Util.toIntsRef(term, scratchIntsRef), nothing); } return b.finish(); @@ -1698,8 +1700,8 @@ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final Builder builder = new Builder(FST.INPUT_TYPE.BYTE4, 2, 0, true, true, Integer.MAX_VALUE, outputs, null); - builder.add("stat", outputs.get(17)); - builder.add("station", outputs.get(10)); + builder.add(Util.toUTF32("stat", new IntsRef()), outputs.get(17)); + builder.add(Util.toUTF32("station", new IntsRef()), outputs.get(10)); final FST fst = builder.finish(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); StringWriter w = new StringWriter(); @@ -1713,8 +1715,8 @@ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, null); - builder.add(new BytesRef("stat"), outputs.getNoOutput()); - builder.add(new BytesRef("station"), outputs.getNoOutput()); + builder.add(Util.toIntsRef(new BytesRef("stat"), new IntsRef()), outputs.getNoOutput()); + builder.add(Util.toIntsRef(new BytesRef("station"), new IntsRef()), outputs.getNoOutput()); final FST fst = builder.finish(); 
StringWriter w = new StringWriter(); //Writer w = new OutputStreamWriter(new FileOutputStream("/x/tmp/out.dot")); Index: lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (revision 1231386) +++ lucene/src/java/org/apache/lucene/codecs/memory/MemoryPostingsFormat.java (working copy) @@ -51,10 +51,12 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.ByteSequenceOutputs; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; // TODO: would be nice to somehow allow this to act like // InstantiatedIndex, by never writing to disk; ie you write @@ -183,6 +185,8 @@ private final BytesRef spare = new BytesRef(); private byte[] finalBuffer = new byte[128]; + private final IntsRef scratchIntsRef = new IntsRef(); + @Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { @@ -213,7 +217,7 @@ System.out.println(" " + Integer.toHexString(finalBuffer[i]&0xFF)); } } - builder.add(text, BytesRef.deepCopyOf(spare)); + builder.add(Util.toIntsRef(text, scratchIntsRef), BytesRef.deepCopyOf(spare)); termCount++; } Index: lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexReader.java (revision 1231386) +++ lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexReader.java (working copy) @@ -33,6 +33,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.IntsRef; import 
org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; @@ -187,6 +188,7 @@ if (indexDivisor > 1) { // subsample + final IntsRef scratchIntsRef = new IntsRef(); final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); @@ -194,7 +196,7 @@ int count = indexDivisor; while((result = fstEnum.next()) != null) { if (count == indexDivisor) { - builder.add(result.input, result.output); + builder.add(Util.toIntsRef(result.input, scratchIntsRef), result.output); count = 0; } count++; Index: lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java (revision 1231386) +++ lucene/src/java/org/apache/lucene/codecs/VariableGapTermsIndexWriter.java (working copy) @@ -29,9 +29,11 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; /** * Selects index terms according to provided pluggable @@ -227,7 +229,7 @@ ////System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in - fstBuilder.add(new BytesRef(), fstOutputs.get(termsFilePointer)); + fstBuilder.add(new IntsRef(), fstOutputs.get(termsFilePointer)); startTermsFilePointer = termsFilePointer; } @@ -246,6 +248,8 @@ } } + private final IntsRef scratchIntsRef = new IntsRef(); + @Override public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException { if (text.length == 0) { @@ -256,7 +260,7 @@ final int lengthSave = text.length; 
text.length = indexedTermPrefixLength(lastTerm, text); try { - fstBuilder.add(text, fstOutputs.get(termsFilePointer)); + fstBuilder.add(Util.toIntsRef(text, scratchIntsRef), fstOutputs.get(termsFilePointer)); } finally { text.length = lengthSave; } Index: lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (revision 1231386) +++ lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldsReader.java (working copy) @@ -36,6 +36,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; @@ -44,6 +45,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; class SimpleTextFieldsReader extends FieldsProducer { @@ -477,11 +479,12 @@ int docFreq = 0; long totalTermFreq = 0; OpenBitSet visitedDocs = new OpenBitSet(); + final IntsRef scratchIntsRef = new IntsRef(); while(true) { SimpleTextUtil.readLine(in, scratch); if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) { if (lastDocsStart != -1) { - b.add(lastTerm, new PairOutputs.Pair>(lastDocsStart, + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair>(lastDocsStart, new PairOutputs.Pair((long) docFreq, posIntOutputs.get(totalTermFreq)))); sumTotalTermFreq += totalTermFreq; @@ -497,7 +500,7 @@ totalTermFreq++; } else if (StringHelper.startsWith(scratch, TERM)) { if (lastDocsStart != -1) { - b.add(lastTerm, new PairOutputs.Pair>(lastDocsStart, + b.add(Util.toIntsRef(lastTerm, scratchIntsRef), new PairOutputs.Pair>(lastDocsStart, new 
PairOutputs.Pair((long) docFreq, posIntOutputs.get(totalTermFreq)))); } Index: lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java =================================================================== --- lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (revision 1231386) +++ lucene/src/java/org/apache/lucene/codecs/BlockTreeTermsWriter.java (working copy) @@ -22,8 +22,8 @@ import java.util.Comparator; import java.util.List; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -39,6 +39,7 @@ import org.apache.lucene.util.fst.BytesRefFSTEnum; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.NoOutputs; +import org.apache.lucene.util.fst.Util; /* TODO: @@ -244,6 +245,7 @@ public final boolean hasTerms; public final boolean isFloor; public final int floorLeadByte; + private final IntsRef scratchIntsRef = new IntsRef(); public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List> subIndices) { super(false); @@ -294,7 +296,7 @@ final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()]; assert bytes.length > 0; scratchBytes.writeTo(bytes, 0); - indexBuilder.add(prefix, new BytesRef(bytes, 0, bytes.length)); + indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), new BytesRef(bytes, 0, bytes.length)); scratchBytes.reset(); // Copy over index for all sub-blocks @@ -337,7 +339,7 @@ //if (DEBUG) { // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output); //} - builder.add(indexEnt.input, indexEnt.output); + builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output); } } } @@ -853,13 +855,15 @@ return postingsWriter; } + private final IntsRef scratchIntsRef = 
new IntsRef(); + @Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { assert stats.docFreq > 0; //if (DEBUG) System.out.println("BTTW.finishTerm term=" + fieldInfo.name + ":" + toString(text) + " seg=" + segment + " df=" + stats.docFreq); - blockBuilder.add(text, noOutputs.getNoOutput()); + blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput()); pending.add(new PendingTerm(BytesRef.deepCopyOf(text), stats)); postingsWriter.finishTerm(stats); numTerms++; Index: lucene/src/java/org/apache/lucene/util/fst/Util.java =================================================================== --- lucene/src/java/org/apache/lucene/util/fst/Util.java (revision 1231386) +++ lucene/src/java/org/apache/lucene/util/fst/Util.java (working copy) @@ -381,4 +381,51 @@ return "0x" + Integer.toHexString(label); } } + + /** Decodes the Unicode codepoints from the provided + * CharSequence and places them in the provided scratch + * IntsRef, which must not be null, returning it. */ + public static IntsRef toUTF32(CharSequence s, IntsRef scratch) { + int charIdx = 0; + int intIdx = 0; + final int charLimit = s.length(); + while(charIdx < charLimit) { + scratch.grow(intIdx+1); + final int utf32 = Character.codePointAt(s, charIdx); + scratch.ints[intIdx] = utf32; + charIdx += Character.charCount(utf32); + intIdx++; + } + scratch.length = intIdx; + return scratch; + } + + /** Decodes the Unicode codepoints from the provided + * CharSequence and places them in the provided scratch + * IntsRef, which must not be null, returning it. 
(For this overload the input is the char[] slice s[offset, offset+length), not a CharSequence; no char outside that slice is read.) */ + public static IntsRef toUTF32(char[] s, int offset, int length, IntsRef scratch) { + int charIdx = offset; + int intIdx = 0; + final int charLimit = offset + length; + while(charIdx < charLimit) { + scratch.grow(intIdx+1); + final int utf32 = Character.codePointAt(s, charIdx, charLimit); /* bounded by charLimit so a high surrogate ending the slice is not paired with a char beyond it */ + scratch.ints[intIdx] = utf32; + charIdx += Character.charCount(utf32); + intIdx++; + } + scratch.length = intIdx; + return scratch; + } + + /** Just takes unsigned byte values from the BytesRef and + * converts into an IntsRef. */ + public static IntsRef toIntsRef(BytesRef input, IntsRef scratch) { + scratch.grow(input.length); + for(int i=0;i