Index: src/test/org/apache/lucene/index/DocHelper.java
===================================================================
--- src/test/org/apache/lucene/index/DocHelper.java	(revision 558686)
+++ src/test/org/apache/lucene/index/DocHelper.java	(working copy)
@@ -207,57 +207,40 @@
   }
 
   /**
-   * Writes the document to the directory using a segment named "test"
+   * Writes the document to the directory using a segment
+   * named "test"; returns the SegmentInfo describing the new
+   * segment
    * @param dir
    * @param doc
    * @throws IOException
    */
-  public static void writeDoc(Directory dir, Document doc) throws IOException
+  public static SegmentInfo writeDoc(Directory dir, Document doc) throws IOException
   {
-    writeDoc(dir, "test", doc);
+    return writeDoc(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), doc);
   }
 
   /**
-   * Writes the document to the directory in the given segment
+   * Writes the document to the directory using the given
+   * analyzer and Similarity; returns the SegmentInfo
+   * describing the new segment
    * @param dir
-   * @param segment
-   * @param doc
-   * @throws IOException
-   */
-  public static void writeDoc(Directory dir, String segment, Document doc) throws IOException
-  {
-    Similarity similarity = Similarity.getDefault();
-    writeDoc(dir, new WhitespaceAnalyzer(), similarity, segment, doc);
-  }
-
-  /**
-   * Writes the document to the directory segment named "test" using the specified analyzer and similarity
-   * @param dir
    * @param analyzer
    * @param similarity
    * @param doc
    * @throws IOException
    */
-  public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) throws IOException
+  public static SegmentInfo writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc) throws IOException
   {
-    writeDoc(dir, analyzer, similarity, "test", doc);
+    IndexWriter writer = new IndexWriter(dir, analyzer);
+    writer.setSimilarity(similarity);
+    //writer.setUseCompoundFile(false);
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
+    writer.close();
+    return info;
   }
 
-  /**
-   * Writes the document to the directory segment using the analyzer and the similarity score
-   * @param dir
-   * @param analyzer
-   * @param similarity
-   * @param segment
-   * @param doc
-   * @throws IOException
-   */
-  public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc) throws IOException
-  {
-    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
-    writer.addDocument(segment, doc);
-  }
-
   public static int numFields(Document doc) {
     return doc.getFields().size();
   }
Index: src/test/org/apache/lucene/index/TestDoc.java
===================================================================
--- src/test/org/apache/lucene/index/TestDoc.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestDoc.java	(working copy)
@@ -105,14 +105,16 @@
       StringWriter sw = new StringWriter();
       PrintWriter out = new PrintWriter(sw, true);
 
-      Directory directory = FSDirectory.getDirectory(indexDir, true);
-      directory.close();
+      Directory directory = FSDirectory.getDirectory(indexDir);
+      IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
 
-      SegmentInfo si1 = indexDoc("one", "test.txt");
+      SegmentInfo si1 = indexDoc(writer, "test.txt");
       printSegment(out, si1);
 
-      SegmentInfo si2 = indexDoc("two", "test2.txt");
+      SegmentInfo si2 = indexDoc(writer, "test2.txt");
       printSegment(out, si2);
+      writer.close();
+      directory.close();
 
      SegmentInfo siMerge = merge(si1, si2, "merge", false);
      printSegment(out, siMerge);
@@ -131,14 +133,16 @@
      sw = new StringWriter();
      out = new PrintWriter(sw, true);
 
-      directory = FSDirectory.getDirectory(indexDir, true);
-      directory.close();
+      directory = FSDirectory.getDirectory(indexDir);
+      writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
 
-      si1 = indexDoc("one", "test.txt");
+      si1 = indexDoc(writer, "test.txt");
      printSegment(out, si1);
 
-      si2 = indexDoc("two", "test2.txt");
+      si2 = indexDoc(writer, "test2.txt");
      printSegment(out, si2);
+      writer.close();
+      directory.close();
 
      siMerge = merge(si1, si2, "merge", true);
      printSegment(out, siMerge);
@@ -157,21 +161,14 @@
    }
 
-   private SegmentInfo indexDoc(String segment, String fileName)
+   private SegmentInfo indexDoc(IndexWriter writer, String fileName)
   throws Exception
    {
-      Directory directory = FSDirectory.getDirectory(indexDir, false);
-      Analyzer analyzer = new SimpleAnalyzer();
-      DocumentWriter writer =
-         new DocumentWriter(directory, analyzer, Similarity.getDefault(), 1000);
-
      File file = new File(workDir, fileName);
      Document doc = FileDocument.Document(file);
-
-      writer.addDocument(segment, doc);
-
-      directory.close();
-      return new SegmentInfo(segment, 1, directory, false, false);
+      writer.addDocument(doc);
+      writer.flush();
+      return writer.segmentInfos.info(writer.segmentInfos.size()-1);
    }
Index: src/test/org/apache/lucene/index/TestMultiReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestMultiReader.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestMultiReader.java	(working copy)
@@ -43,15 +43,20 @@
   protected void setUp() throws IOException {
     DocHelper.setupDoc(doc1);
     DocHelper.setupDoc(doc2);
-    DocHelper.writeDoc(dir, "seg-1", doc1);
-    DocHelper.writeDoc(dir, "seg-2", doc2);
+    SegmentInfo info1 = DocHelper.writeDoc(dir, doc1);
+    SegmentInfo info2 = DocHelper.writeDoc(dir, doc2);
     sis.write(dir);
-    reader1 = SegmentReader.get(new SegmentInfo("seg-1", 1, dir));
-    reader2 = SegmentReader.get(new SegmentInfo("seg-2", 1, dir));
+    openReaders();
+  }
+
+  private void openReaders() throws IOException {
+    sis.read(dir);
+    reader1 = SegmentReader.get(sis.info(0));
+    reader2 = SegmentReader.get(sis.info(1));
     readers[0] = reader1;
     readers[1] = reader2;
   }
-  
+
   public void test() {
     assertTrue(dir != null);
     assertTrue(reader1 != null);
@@ -87,7 +92,9 @@
 
     // Ensure undeleteAll survives commit/close/reopen:
     reader.commit();
     reader.close();
+    sis.read(dir);
+    openReaders();
     reader = new MultiReader(dir, sis, false, readers);
     assertEquals( 2, reader.numDocs() );
Index: src/test/org/apache/lucene/index/TestFieldsReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestFieldsReader.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestFieldsReader.java	(working copy)
@@ -35,6 +35,8 @@
   private Document testDoc = new Document();
   private FieldInfos fieldInfos = null;
 
+  private final static String TEST_SEGMENT_NAME = "_0";
+
   public TestFieldsReader(String s) {
     super(s);
   }
@@ -43,16 +45,16 @@
     fieldInfos = new FieldInfos();
     DocHelper.setupDoc(testDoc);
     fieldInfos.add(testDoc);
-    DocumentWriter writer = new DocumentWriter(dir, new WhitespaceAnalyzer(),
-        Similarity.getDefault(), 50);
-    assertTrue(writer != null);
-    writer.addDocument("test", testDoc);
+    IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+    writer.setUseCompoundFile(false);
+    writer.addDocument(testDoc);
+
     writer.close();
   }
 
   public void test() throws IOException {
     assertTrue(dir != null);
     assertTrue(fieldInfos != null);
-    FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
+    FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
     assertTrue(reader != null);
     assertTrue(reader.size() == 1);
     Document doc = reader.doc(0, null);
@@ -82,7 +84,7 @@
   public void testLazyFields() throws Exception {
     assertTrue(dir != null);
     assertTrue(fieldInfos != null);
-    FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
+    FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
     assertTrue(reader != null);
     assertTrue(reader.size() == 1);
     Set loadFieldNames = new HashSet();
@@ -137,7 +139,7 @@
   public void testLazyFieldsAfterClose() throws Exception {
     assertTrue(dir != null);
     assertTrue(fieldInfos != null);
-    FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
+    FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
     assertTrue(reader != null);
     assertTrue(reader.size() == 1);
     Set loadFieldNames = new HashSet();
@@ -167,7 +169,7 @@
   public void testLoadFirst() throws Exception {
     assertTrue(dir != null);
     assertTrue(fieldInfos != null);
-    FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
+    FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
     assertTrue(reader != null);
     assertTrue(reader.size() == 1);
     LoadFirstFieldSelector fieldSelector = new LoadFirstFieldSelector();
@@ -200,10 +202,12 @@
     _TestUtil.rmDir(file);
     FSDirectory tmpDir = FSDirectory.getDirectory(file);
     assertTrue(tmpDir != null);
-    DocumentWriter writer = new DocumentWriter(tmpDir, new WhitespaceAnalyzer(),
-        Similarity.getDefault(), 50);
-    assertTrue(writer != null);
-    writer.addDocument("test", testDoc);
+
+    IndexWriter writer = new IndexWriter(tmpDir, new WhitespaceAnalyzer(), true);
+    writer.setUseCompoundFile(false);
+    writer.addDocument(testDoc);
+    writer.close();
+
     assertTrue(fieldInfos != null);
     FieldsReader reader;
     long lazyTime = 0;
@@ -214,7 +218,7 @@
     SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.EMPTY_SET, lazyFieldNames);
     for (int i = 0; i < length; i++) {
-      reader = new FieldsReader(tmpDir, "test", fieldInfos);
+      reader = new FieldsReader(tmpDir, TEST_SEGMENT_NAME, fieldInfos);
       assertTrue(reader != null);
       assertTrue(reader.size() == 1);
@@ -238,7 +242,7 @@
     doc = null;
     //Hmmm, are we still in cache???
     System.gc();
-    reader = new FieldsReader(tmpDir, "test", fieldInfos);
+    reader = new FieldsReader(tmpDir, TEST_SEGMENT_NAME, fieldInfos);
     doc = reader.doc(0, fieldSelector);
     field = doc.getFieldable(DocHelper.LARGE_LAZY_FIELD_KEY);
     assertTrue("field is not lazy", field.isLazy() == true);
@@ -256,7 +260,7 @@
   }
 
   public void testLoadSize() throws IOException {
-    FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
+    FieldsReader reader = new FieldsReader(dir, TEST_SEGMENT_NAME, fieldInfos);
     Document doc;
     doc = reader.doc(0, new FieldSelector(){
Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentTermDocs.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java	(working copy)
@@ -29,6 +29,7 @@
 public class TestSegmentTermDocs extends TestCase {
   private Document testDoc = new Document();
   private Directory dir = new RAMDirectory();
+  private SegmentInfo info;
 
   public TestSegmentTermDocs(String s) {
     super(s);
@@ -36,7 +37,7 @@
 
   protected void setUp() throws IOException {
     DocHelper.setupDoc(testDoc);
-    DocHelper.writeDoc(dir, testDoc);
+    info = DocHelper.writeDoc(dir, testDoc);
   }
 
@@ -50,7 +51,7 @@
 
   public void testTermDocs() throws IOException {
     //After adding the document, we should be able to read it back in
-    SegmentReader reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
+    SegmentReader reader = SegmentReader.get(info);
     assertTrue(reader != null);
     SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
     assertTrue(segTermDocs != null);
@@ -68,7 +69,7 @@
 
   public void testBadSeek() throws IOException {
     {
       //After adding the document, we should be able to read it back in
-      SegmentReader reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
+      SegmentReader reader = SegmentReader.get(info);
       assertTrue(reader != null);
       SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
       assertTrue(segTermDocs != null);
@@ -78,7 +79,7 @@
     }
     {
       //After adding the document, we should be able to read it back in
-      SegmentReader reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
+      SegmentReader reader = SegmentReader.get(info);
       assertTrue(reader != null);
       SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
       assertTrue(segTermDocs != null);
Index: src/test/org/apache/lucene/index/TestSegmentMerger.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentMerger.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestSegmentMerger.java	(working copy)
@@ -32,12 +32,10 @@
   //First segment to be merged
   private Directory merge1Dir = new RAMDirectory();
   private Document doc1 = new Document();
-  private String merge1Segment = "test-1";
   private SegmentReader reader1 = null;
   //Second Segment to be merged
   private Directory merge2Dir = new RAMDirectory();
   private Document doc2 = new Document();
-  private String merge2Segment = "test-2";
   private SegmentReader reader2 = null;
 
@@ -47,11 +45,11 @@
 
   protected void setUp() throws IOException {
     DocHelper.setupDoc(doc1);
-    DocHelper.writeDoc(merge1Dir, merge1Segment, doc1);
+    SegmentInfo info1 = DocHelper.writeDoc(merge1Dir, doc1);
     DocHelper.setupDoc(doc2);
-    DocHelper.writeDoc(merge2Dir, merge2Segment, doc2);
-    reader1 = SegmentReader.get(new SegmentInfo(merge1Segment, 1, merge1Dir));
-    reader2 = SegmentReader.get(new SegmentInfo(merge2Segment, 1, merge2Dir));
+    SegmentInfo info2 = DocHelper.writeDoc(merge2Dir, doc2);
+    reader1 = SegmentReader.get(info1);
+    reader2 = SegmentReader.get(info2);
   }
 
   public void test() {
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java	(working copy)
@@ -32,6 +32,8 @@
 import java.io.Reader;
 import java.io.IOException;
 
+import java.util.Arrays;
+
 public class TestDocumentWriter extends TestCase {
   private RAMDirectory dir;
 
@@ -57,11 +59,13 @@
     DocHelper.setupDoc(testDoc);
     Analyzer analyzer = new WhitespaceAnalyzer();
     Similarity similarity = Similarity.getDefault();
-    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
-    String segName = "test";
-    writer.addDocument(segName, testDoc);
+    IndexWriter writer = new IndexWriter(dir, analyzer, true);
+    writer.addDocument(testDoc);
+    writer.flush();
+    SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
+    writer.close();
     //After adding the document, we should be able to read it back in
-    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+    SegmentReader reader = SegmentReader.get(info);
     assertTrue(reader != null);
     Document doc = reader.document(0);
     assertTrue(doc != null);
@@ -89,14 +93,14 @@
     assertTrue(fields != null && fields.length == 1);
     assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
 
-    // test that the norm file is not present if omitNorms is true
+    // test that the norms are not present in the segment if
+    // omitNorms is true
     for (int i = 0; i < reader.fieldInfos.size(); i++) {
       FieldInfo fi = reader.fieldInfos.fieldInfo(i);
       if (fi.isIndexed) {
-        assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
+        assertTrue(fi.omitNorms == !reader.hasNorms(fi.name));
       }
     }
-
   }
 
   public void testPositionIncrementGap() throws IOException {
@@ -111,14 +115,17 @@
     };
 
     Similarity similarity = Similarity.getDefault();
-    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
+    IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
     Document doc = new Document();
     doc.add(new Field("repeated", "repeated one", Field.Store.YES, Field.Index.TOKENIZED));
     doc.add(new Field("repeated", "repeated two", Field.Store.YES, Field.Index.TOKENIZED));
-    String segName = "test";
-    writer.addDocument(segName, doc);
-    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
+    writer.close();
+    SegmentReader reader = SegmentReader.get(info);
 
     TermPositions termPositions = reader.termPositions(new Term("repeated", "repeated"));
     assertTrue(termPositions.next());
@@ -130,7 +137,7 @@
 
   public void testPreAnalyzedField() throws IOException {
     Similarity similarity = Similarity.getDefault();
-    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
+    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
     Document doc = new Document();
 
     doc.add(new Field("preanalyzed", new TokenStream() {
@@ -147,9 +154,11 @@
     }, TermVector.NO));
 
-    String segName = "test";
-    writer.addDocument(segName, doc);
-    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+    writer.addDocument(doc);
+    writer.flush();
+    SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size()-1);
+    writer.close();
+    SegmentReader reader = SegmentReader.get(info);
 
     TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
     assertTrue(termPositions.next());
Index: src/test/org/apache/lucene/index/TestSegmentReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentReader.java	(revision 558686)
+++ src/test/org/apache/lucene/index/TestSegmentReader.java	(working copy)
@@ -41,8 +41,8 @@
   //TODO: Setup the reader w/ multiple documents
   protected void setUp() throws IOException {
     DocHelper.setupDoc(testDoc);
-    DocHelper.writeDoc(dir, testDoc);
-    reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
+    SegmentInfo info = DocHelper.writeDoc(dir, testDoc);
+    reader = SegmentReader.get(info);
   }
 
   protected void tearDown() {
@@ -75,8 +75,8 @@
   public void testDelete() throws IOException {
     Document docToDelete = new Document();
     DocHelper.setupDoc(docToDelete);
-    DocHelper.writeDoc(dir, "seg-to-delete", docToDelete);
-    SegmentReader deleteReader = SegmentReader.get(new SegmentInfo("seg-to-delete", 1, dir));
+    SegmentInfo info = DocHelper.writeDoc(dir, docToDelete);
+    SegmentReader deleteReader = SegmentReader.get(info);
     assertTrue(deleteReader != null);
     assertTrue(deleteReader.numDocs() == 1);
     deleteReader.deleteDocument(0);
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java	(revision 558686)
+++ src/java/org/apache/lucene/index/DocumentWriter.java	(working copy)
@@ -1,556 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexOutput;
-
-import java.io.IOException;
-import java.io.PrintStream;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.Arrays;
-import java.util.BitSet;
-import java.util.Enumeration;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-
-final class DocumentWriter {
-  private Analyzer analyzer;
-  private Directory directory;
-  private Similarity similarity;
-  private FieldInfos fieldInfos;
-  private int maxFieldLength;
-  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
-  private PrintStream infoStream;
-
-  /** This ctor used by test code only.
-   *
-   * @param directory The directory to write the document information to
-   * @param analyzer The analyzer to use for the document
-   * @param similarity The Similarity function
-   * @param maxFieldLength The maximum number of tokens a field may have
-   */
-  DocumentWriter(Directory directory, Analyzer analyzer,
-                 Similarity similarity, int maxFieldLength) {
-    this.directory = directory;
-    this.analyzer = analyzer;
-    this.similarity = similarity;
-    this.maxFieldLength = maxFieldLength;
-  }
-
-  DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
-    this.directory = directory;
-    this.analyzer = analyzer;
-    this.similarity = writer.getSimilarity();
-    this.maxFieldLength = writer.getMaxFieldLength();
-    this.termIndexInterval = writer.getTermIndexInterval();
-  }
-
-  final void addDocument(String segment, Document doc)
-          throws CorruptIndexException, IOException {
-    // create field infos
-    fieldInfos = new FieldInfos();
-    fieldInfos.add(doc);
-
-    // invert doc into postingTable
-    postingTable.clear();                         // clear postingTable
-    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
-    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
-    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
-    fieldStoresPayloads = new BitSet(fieldInfos.size());
-
-    fieldBoosts = new float[fieldInfos.size()];   // init fieldBoosts
-    Arrays.fill(fieldBoosts, doc.getBoost());
-
-    try {
-
-      // Before we write the FieldInfos we invert the Document. The reason is that
-      // during invertion the TokenStreams of tokenized fields are being processed
-      // and we might encounter tokens that have payloads associated with them. In
-      // this case we have to update the FieldInfo of the particular field.
-      invertDocument(doc);
-
-      // sort postingTable into an array
-      Posting[] postings = sortPostingTable();
-
-      // write field infos
-      fieldInfos.write(directory, segment + ".fnm");
-
-      // write field values
-      FieldsWriter fieldsWriter =
-              new FieldsWriter(directory, segment, fieldInfos);
-      try {
-        fieldsWriter.addDocument(doc);
-      } finally {
-        fieldsWriter.close();
-      }
-
-      /*
-      for (int i = 0; i < postings.length; i++) {
-        Posting posting = postings[i];
-        System.out.print(posting.term);
-        System.out.print(" freq=" + posting.freq);
-        System.out.print(" pos=");
-        System.out.print(posting.positions[0]);
-        for (int j = 1; j < posting.freq; j++)
-          System.out.print("," + posting.positions[j]);
-        System.out.println("");
-      }
-      */
-
-      // write postings
-      writePostings(postings, segment);
-
-      // write norms of indexed fields
-      writeNorms(segment);
-    } finally {
-      // close TokenStreams
-      IOException ex = null;
-
-      Iterator it = openTokenStreams.iterator();
-      while (it.hasNext()) {
-        try {
-          ((TokenStream) it.next()).close();
-        } catch (IOException e) {
-          if (ex != null) {
-            ex = e;
-          }
-        }
-      }
-      openTokenStreams.clear();
-
-      if (ex != null) {
-        throw ex;
-      }
-    }
-  }
-
-  // Keys are Terms, values are Postings.
-  // Used to buffer a document before it is written to the index.
-  private final Hashtable postingTable = new Hashtable();
-  private int[] fieldLengths;
-  private int[] fieldPositions;
-  private int[] fieldOffsets;
-  private float[] fieldBoosts;
-
-  // If any of the tokens of a paticular field carry a payload
-  // then we enable payloads for that field.
-  private BitSet fieldStoresPayloads;
-
-  // Keep references of the token streams. We must close them after
-  // the postings are written to the segment.
-  private List openTokenStreams = new LinkedList();
-
-  // Tokenizes the fields of a document into Postings.
-  private final void invertDocument(Document doc)
-          throws IOException {
-    Iterator fieldIterator = doc.getFields().iterator();
-    while (fieldIterator.hasNext()) {
-      Fieldable field = (Fieldable) fieldIterator.next();
-      String fieldName = field.name();
-      int fieldNumber = fieldInfos.fieldNumber(fieldName);
-
-      int length = fieldLengths[fieldNumber];     // length of field
-      int position = fieldPositions[fieldNumber]; // position in field
-      if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
-      int offset = fieldOffsets[fieldNumber];     // offset field
-
-      if (field.isIndexed()) {
-        if (!field.isTokenized()) {               // un-tokenized field
-          String stringValue = field.stringValue();
-          if(field.isStoreOffsetWithTermVector())
-            addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
-          else
-            addPosition(fieldName, stringValue, position++, null, null);
-          offset += stringValue.length();
-          length++;
-        } else
-        {   // tokenized field
-          TokenStream stream = field.tokenStreamValue();
-
-          // the field does not have a TokenStream,
-          // so we have to obtain one from the analyzer
-          if (stream == null) {
-            Reader reader;                        // find or make Reader
-            if (field.readerValue() != null)
-              reader = field.readerValue();
-            else if (field.stringValue() != null)
-              reader = new StringReader(field.stringValue());
-            else
-              throw new IllegalArgumentException
-                      ("field must have either String or Reader value");
-
-            // Tokenize field and add to postingTable
-            stream = analyzer.tokenStream(fieldName, reader);
-          }
-
-          // remember this TokenStream, we must close it later
-          openTokenStreams.add(stream);
-
-          // reset the TokenStream to the first token
-          stream.reset();
-
-
-          Token lastToken = null;
-          for (Token t = stream.next(); t != null; t = stream.next()) {
-            position += (t.getPositionIncrement() - 1);
-
-            Payload payload = t.getPayload();
-            if (payload != null) {
-              // enable payloads for this field
-              fieldStoresPayloads.set(fieldNumber);
-            }
-
-            TermVectorOffsetInfo termVectorOffsetInfo;
-            if (field.isStoreOffsetWithTermVector()) {
-              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
-            } else {
-              termVectorOffsetInfo = null;
-            }
-            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
-
-            lastToken = t;
-            if (++length >= maxFieldLength) {
-              if (infoStream != null)
-                infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens");
-              break;
-            }
-          }
-
-          if(lastToken != null)
-            offset += lastToken.endOffset() + 1;
-        }
-
-        fieldLengths[fieldNumber] = length;       // save field length
-        fieldPositions[fieldNumber] = position;   // save field position
-        fieldBoosts[fieldNumber] *= field.getBoost();
-        fieldOffsets[fieldNumber] = offset;
-      }
-    }
-
-    // update fieldInfos for all fields that have one or more tokens with payloads
-    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
-      fieldInfos.fieldInfo(i).storePayloads = true;
-    }
-  }
-
-  private final Term termBuffer = new Term("", ""); // avoid consing
-
-  private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
-    termBuffer.set(field, text);
-    //System.out.println("Offset: " + offset);
-    Posting ti = (Posting) postingTable.get(termBuffer);
-    if (ti != null) {                             // word seen before
-      int freq = ti.freq;
-      if (ti.positions.length == freq) {         // positions array is full
-        int[] newPositions = new int[freq * 2];  // double size
-        int[] positions = ti.positions;
-        System.arraycopy(positions, 0, newPositions, 0, freq);
-        ti.positions = newPositions;
-
-        if (ti.payloads != null) {
-          // the current field stores payloads
-          Payload[] newPayloads = new Payload[freq * 2];  // grow payloads array
-          Payload[] payloads = ti.payloads;
-          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
-          ti.payloads = newPayloads;
-        }
-      }
-      ti.positions[freq] = position;              // add new position
-
-      if (payload != null) {
-        if (ti.payloads == null) {
-          // lazily allocate payload array
-          ti.payloads = new Payload[ti.positions.length];
-        }
-        ti.payloads[freq] = payload;
-      }
-
-      if (offset != null) {
-        if (ti.offsets.length == freq){
-          TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
-          TermVectorOffsetInfo [] offsets = ti.offsets;
-          System.arraycopy(offsets, 0, newOffsets, 0, freq);
-          ti.offsets = newOffsets;
-        }
-        ti.offsets[freq] = offset;
-      }
-      ti.freq = freq + 1;                         // update frequency
-    } else {                                      // word not seen before
-      Term term = new Term(field, text, false);
-      postingTable.put(term, new Posting(term, position, payload, offset));
-    }
-  }
-
-  private final Posting[] sortPostingTable() {
-    // copy postingTable into an array
-    Posting[] array = new Posting[postingTable.size()];
-    Enumeration postings = postingTable.elements();
-    for (int i = 0; postings.hasMoreElements(); i++)
-      array[i] = (Posting) postings.nextElement();
-
-    // sort the array
-    quickSort(array, 0, array.length - 1);
-
-    return array;
-  }
-
-  private static final void quickSort(Posting[] postings, int lo, int hi) {
-    if (lo >= hi)
-      return;
-
-    int mid = (lo + hi) >>> 1;
-
-    if (postings[lo].term.compareTo(postings[mid].term) > 0) {
-      Posting tmp = postings[lo];
-      postings[lo] = postings[mid];
-      postings[mid] = tmp;
-    }
-
-    if (postings[mid].term.compareTo(postings[hi].term) > 0) {
-      Posting tmp = postings[mid];
-      postings[mid] = postings[hi];
-      postings[hi] = tmp;
-
-      if (postings[lo].term.compareTo(postings[mid].term) > 0) {
-        Posting tmp2 = postings[lo];
-        postings[lo] = postings[mid];
-        postings[mid] = tmp2;
-      }
-    }
-
-    int left = lo + 1;
-    int right = hi - 1;
-
-    if (left >= right)
-      return;
-
-    Term partition = postings[mid].term;
-
-    for (; ;) {
-      while (postings[right].term.compareTo(partition) > 0)
-        --right;
-
-      while (left < right && postings[left].term.compareTo(partition) <= 0)
-        ++left;
-
-      if (left < right) {
-        Posting tmp = postings[left];
-        postings[left] = postings[right];
-        postings[right] = tmp;
-        --right;
-      } else {
-        break;
-      }
-    }
-
-    quickSort(postings, lo, left);
-    quickSort(postings, left + 1, hi);
-  }
-
-  private final void writePostings(Posting[] postings, String segment)
-          throws CorruptIndexException, IOException {
-    IndexOutput freq = null, prox = null;
-    TermInfosWriter tis = null;
-    TermVectorsWriter termVectorWriter = null;
-    try {
-      //open files for inverse index storage
-      freq = directory.createOutput(segment + ".frq");
-      prox = directory.createOutput(segment + ".prx");
-      tis = new TermInfosWriter(directory, segment, fieldInfos,
-                                termIndexInterval);
-      TermInfo ti = new TermInfo();
-      String currentField = null;
-      boolean currentFieldHasPayloads = false;
-
-      for (int i = 0; i < postings.length; i++) {
-        Posting posting = postings[i];
-
-        // check to see if we switched to a new field
-        String termField = posting.term.field();
-        if (currentField != termField) {
-          // changing field - see if there is something to save
-          currentField = termField;
-          FieldInfo fi = fieldInfos.fieldInfo(currentField);
-          currentFieldHasPayloads = fi.storePayloads;
-          if (fi.storeTermVector) {
-            if (termVectorWriter == null) {
-              termVectorWriter =
-                      new TermVectorsWriter(directory, segment, fieldInfos);
-              termVectorWriter.openDocument();
-            }
-            termVectorWriter.openField(currentField);
-
-          } else if (termVectorWriter != null) {
-            termVectorWriter.closeField();
-          }
-        }
-
-        // add an entry to the dictionary with pointers to prox and freq files
-        ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
-        tis.add(posting.term, ti);
-
-        // add an entry to the freq file
-        int postingFreq = posting.freq;
-        if (postingFreq == 1)                     // optimize freq=1
-          freq.writeVInt(1);                      // set low bit of doc num.
-        else {
-          freq.writeVInt(0);                      // the document number
-          freq.writeVInt(postingFreq);            // frequency in doc
-        }
-
-        int lastPosition = 0;                     // write positions
-        int[] positions = posting.positions;
-        Payload[] payloads = posting.payloads;
-        int lastPayloadLength = -1;
-
-
-        // The following encoding is being used for positions and payloads:
-        // Case 1: current field does not store payloads
-        //   Positions     -> <PositionDelta>^freq
-        //   PositionDelta -> VInt
-        //   The PositionDelta is the difference between the current
-        //   and the previous position
-        // Case 2: current field stores payloads
-        //   Positions     -> <PositionDelta, Payload>^freq
-        //   Payload       -> <PayloadLength?, PayloadData>
-        //   PositionDelta -> VInt
-        //   PayloadLength -> VInt
-        //   PayloadData   -> byte^PayloadLength
-        //   In this case PositionDelta/2 is the difference between
-        //   the current and the previous position. If PositionDelta
-        //   is odd, then a PayloadLength encoded as VInt follows,
-        //   if PositionDelta is even, then it is assumed that the
-        //   length of the current Payload equals the length of the
-        //   previous Payload.
-        for (int j = 0; j < postingFreq; j++) {   // use delta-encoding
-          int position = positions[j];
-          int delta = position - lastPosition;
-          if (currentFieldHasPayloads) {
-            int payloadLength = 0;
-            Payload payload = null;
-            if (payloads != null) {
-              payload = payloads[j];
-              if (payload != null) {
-                payloadLength = payload.length;
-              }
-            }
-            if (payloadLength == lastPayloadLength) {
-              // the length of the current payload equals the length
-              // of the previous one. So we do not have to store the length
-              // again and we only shift the position delta by one bit
-              prox.writeVInt(delta * 2);
-            } else {
-              // the length of the current payload is different from the
-              // previous one. We shift the position delta, set the lowest
-              // bit and store the current payload length as VInt.
-              prox.writeVInt(delta * 2 + 1);
-              prox.writeVInt(payloadLength);
-              lastPayloadLength = payloadLength;
-            }
-            if (payloadLength > 0) {
-              // write current payload
-              prox.writeBytes(payload.data, payload.offset, payload.length);
-            }
-          } else {
-            // field does not store payloads, just write position delta as VInt
-            prox.writeVInt(delta);
-          }
-          lastPosition = position;
-        }
-        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
-          termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
-        }
-      }
-      if (termVectorWriter != null)
-        termVectorWriter.closeDocument();
-    } finally {
-      // make an effort to close all streams we can but remember and re-throw
-      // the first exception encountered in this process
-      IOException keep = null;
-      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (keep != null) throw (IOException) keep.fillInStackTrace();
-    }
-  }
-
-  private final void writeNorms(String segment) throws IOException {
-    for(int n = 0; n < fieldInfos.size(); n++){
-      FieldInfo fi = fieldInfos.fieldInfo(n);
-      if(fi.isIndexed && !fi.omitNorms){
-        float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
-        IndexOutput norms = directory.createOutput(segment + ".f" + n);
-        try {
-          norms.writeByte(Similarity.encodeNorm(norm));
-        } finally {
-          norms.close();
-        }
-      }
-    }
-  }
-
-  /** If non-null, a message will be printed to this if maxFieldLength is reached.
-   */
-  void setInfoStream(PrintStream infoStream) {
-    this.infoStream = infoStream;
-  }
-
-  int getNumFields() {
-    return fieldInfos.size();
-  }
-}
-
-final class Posting {                             // info about a Term in a doc
-  Term term;                                      // the Term
-  int freq;                                       // its frequency in doc
-  int[] positions;                                // positions it occurs at
-  Payload[] payloads;                             // the payloads of the terms
-  TermVectorOffsetInfo [] offsets;
-
-
-  Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
-    term = t;
-    freq = 1;
-    positions = new int[1];
-    positions[0] = position;
-
-    if (payload != null) {
-      payloads = new Payload[1];
-      payloads[0] = payload;
-    } else
-      payloads = null;
-
-
-    if(offset != null){
-      offsets = new TermVectorOffsetInfo[1];
-      offsets[0] = offset;
-    } else
-      offsets = null;
-  }
-}
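
Note for reviewers: every test conversion above follows the same pattern, collected here in one place. This is a sketch only; the helper name writeOneSegment is illustrative, and writer.segmentInfos is package-private, so it compiles only from test code living in org.apache.lucene.index (as these tests do).

    // Replacement for the old DocumentWriter-based indexing in tests:
    // add the document through IndexWriter, flush it into a segment,
    // and hand back that segment's SegmentInfo instead of constructing
    // one by hand around a hard-coded segment name.
    static SegmentInfo writeOneSegment(Directory dir, Analyzer analyzer,
                                       Similarity similarity, Document doc)
        throws IOException {
      IndexWriter writer = new IndexWriter(dir, analyzer);
      writer.setSimilarity(similarity);
      writer.addDocument(doc);
      writer.flush();     // force the buffered document into a new segment
      SegmentInfo info = writer.segmentInfos.info(writer.segmentInfos.size() - 1);
      writer.close();
      return info;
    }

Reading the SegmentInfo back from the writer keeps the tests independent of IndexWriter's segment-naming scheme; TestFieldsReader is the one place that still hard-codes a name (TEST_SEGMENT_NAME = "_0"), because it opens a FieldsReader directly rather than going through a SegmentInfo.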