+ public void setVectorSpace(Map<String, List<InstantiatedTermDocumentInformation>> vectorSpace) {
+ this.vectorSpace = vectorSpace;
+ }
+
+ public Document getDocument() {
+ return document;
+ }
+
+
+ public String toString() {
+ return document.toString();
+ }
+}
Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositionVector.java
===================================================================
--- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositionVector.java (revision 0)
+++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositionVector.java (revision 0)
@@ -0,0 +1,46 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+
+import java.io.Serializable;
+
+/**
+ * Date: May 8, 2006
+ * Time: 12:12:31 AM
+ */
+public class InstantiatedTermPositionVector
+ extends InstantiatedTermFreqVector
+ implements TermPositionVector, Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ public InstantiatedTermPositionVector(InstantiatedDocument document, String field) {
+ super(document, field);
+ }
+
+ public int[] getTermPositions(int index) {
+ return getTermDocumentInformations().get(index).getTermPositions();
+ }
+
+ public TermVectorOffsetInfo[] getOffsets(int index) {
+ return getTermDocumentInformations().get(index).getTermOffsets();
+ }
+
+}
Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositions.java
===================================================================
--- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositions.java (revision 0)
+++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermPositions.java (revision 0)
@@ -0,0 +1,102 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.TermPositions;
+
+import java.io.IOException;
+
+/**
+ *
+ * Date: May 6, 2006
+ * Time: 9:13:23 AM
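+ *
+ * <p>Typical consumption pattern (the standard Lucene TermPositions idiom; the reader here is assumed to come from an InstantiatedIndex):
+ * <pre>
+ * TermPositions positions = reader.termPositions(new Term("field", "value"));
+ * while (positions.next()) {
+ *   for (int i = 0; i &lt; positions.freq(); i++) {
+ *     int position = positions.nextPosition();
+ *   }
+ * }
+ * </pre>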
+ */
+public class InstantiatedTermPositions
+ extends InstantiatedTermDocs
+ implements TermPositions {
+
+ public int getPayloadLength() {
+ return currentDocumentInformation.getPayloads()[currentTermPositionIndex].length;
+ }
+
+ public byte[] getPayload(byte[] data, int offset) throws IOException {
+ byte[] payloads = currentDocumentInformation.getPayloads()[currentTermPositionIndex];
+
+ // no need to copy if the given array is null or too small
+ if (data == null || data.length - offset < getPayloadLength()) {
+ // the array is too small to store the payload data, so return the internal array instead
+ return payloads;
+ } else {
+ System.arraycopy(payloads, 0, data, offset, payloads.length);
+ return data;
+ }
+ }
+
+ public boolean isPayloadAvailable() {
+ return currentDocumentInformation.getPayloads()[currentTermPositionIndex] != null;
+ }
+
+ public InstantiatedTermPositions(InstantiatedIndexReader reader) {
+ super(reader);
+ }
+
+ /**
+ * Returns the next position in the current document. It is an error to call
+ * this more than {@link #freq()} times
+ * without calling {@link #next()}. This is
+ * invalid until {@link #next()} is called for
+ * the first time.
+ */
+ public int nextPosition() {
+ currentTermPositionIndex++;
+ // if you get an ArrayIndexOutOfBoundsException here,
+ // it might be due to currentDocumentInformation.getIndexFromTerm not being set!
+ return currentDocumentInformation.getTermPositions()[currentTermPositionIndex];
+ }
+
+ private int currentTermPositionIndex;
+
+ /**
+ * Moves to the next pair in the enumeration.
+ *
+ * <p>Returns true if there is such a next pair in the enumeration.
+ */
+ @Override
+ public boolean next() {
+ currentTermPositionIndex = -1;
+ return super.next();
+ }
+
+ /**
+ * Skips entries to the first beyond the current whose document number is
+ * greater than or equal to <i>target</i>.
+ * <p>Returns true iff there is such
+ * an entry.
+ * <p>Behaves as if written:
+ * <pre>
+ * boolean skipTo(int target) {
+ * do {
+ * if (!next())
+ * return false;
+ * } while (target > doc());
+ * return true;
+ * }
+ * </pre>
+ *
+ * Some implementations are considerably more efficient than that.
+ */
+ @Override
+ public boolean skipTo(int target) {
+ currentTermPositionIndex = -1;
+ return super.skipTo(target);
+ }
+}
Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java
===================================================================
--- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java (revision 0)
+++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocs.java (revision 0)
@@ -0,0 +1,161 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+
+/**
+ * @see org.apache.lucene.index.TermDocs
+ */
+public class InstantiatedTermDocs
+ implements TermDocs {
+
+ private final InstantiatedIndexReader reader;
+
+ public InstantiatedTermDocs(InstantiatedIndexReader reader) {
+ this.reader = reader;
+ }
+
+ private int currentDocumentIndex;
+ protected InstantiatedTermDocumentInformation currentDocumentInformation;
+ protected InstantiatedTerm currentTerm;
+
+ /**
+ * Sets this to the data for a term.
+ * The enumeration is reset to the start of the data for this term.
+ */
+ public void seek(Term term) {
+ currentTerm = reader.getIndex().findTerm(term);
+ currentDocumentIndex = -1;
+ }
+
+ /**
+ * Sets this to the data for the current term in a {@link org.apache.lucene.index.TermEnum}.
+ * This may be optimized in some implementations.
+ */
+ public void seek(org.apache.lucene.index.TermEnum termEnum) {
+ seek(termEnum.term());
+ }
+
+ /**
+ * Returns the current document number. This is invalid until {@link
+ * #next()} is called for the first time.
+ */
+ public int doc() {
+ return currentDocumentInformation.getDocument().getDocumentNumber();
+ }
+
+ /**
+ * Returns the frequency of the term within the current document.
+ * <p>This
+ * is invalid until {@link #next()} is called for the first time.
+ */
+ public int freq() {
+ return currentDocumentInformation.getTermPositions().length;
+ }
+
+ /**
+ * Moves to the next pair in the enumeration.
+ * <p>Returns true iff there is
+ * such a next pair in the enumeration.
+ */
+ public boolean next() {
+ if (currentTerm != null) {
+ currentDocumentIndex++;
+ if (currentDocumentIndex < currentTerm.getAssociatedDocuments().length) {
+ currentDocumentInformation = currentTerm.getAssociatedDocuments()[currentDocumentIndex];
+ if (reader.hasDeletions() && reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber())) {
+ return next();
+ } else {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Attempts to read multiple entries from the enumeration, up to length of
+ * <i>docs</i>. Document numbers are stored in <i>docs</i>, and term
+ * frequencies are stored in <i>freqs</i>. The <i>freqs</i> array must be as
+ * long as the <i>docs</i> array.
+ *
+ * Returns the number of entries read. Zero is only returned when the
+ * stream has been exhausted.
+ */
+ public int read(int[] docs, int[] freqs) {
+ int i;
+ for (i = 0; i < docs.length; i++) {
+ if (!next()) {
+ break;
+ }
+ docs[i] = doc();
+ freqs[i] = freq();
+ }
+ return i;
+ }
+
+ /**
+ * Skips entries to the first beyond the current whose document number is
+ * greater than or equal to <i>target</i>.
+ * <p>Returns true iff there is such
+ * an entry.
+ * <p>Behaves as if written:
+ * <pre>
+ * boolean skipTo(int target) {
+ * do {
+ * if (!next())
+ * return false;
+ * } while (target > doc());
+ * return true;
+ * }
+ * </pre>
+ *
+ * Some implementations are considerably more efficient than that.
+ *
+ */
+ public boolean skipTo(int target) {
+ if (currentTerm == null) {
+ return false;
+ }
+
+ if (currentDocumentIndex >= target) {
+ return next();
+ }
+
+ int startOffset = currentDocumentIndex >= 0 ? currentDocumentIndex : 0;
+ int pos = currentTerm.seekCeilingDocumentInformationIndex(target, startOffset);
+
+ if (pos == -1) {
+ return false;
+ }
+
+ currentDocumentInformation = currentTerm.getAssociatedDocuments()[pos];
+ currentDocumentIndex = pos;
+ if (reader.hasDeletions() && reader.isDeleted(currentDocumentInformation.getDocument().getDocumentNumber())) {
+ return next();
+ } else {
+ return true;
+ }
+
+
+ }
+
+ /**
+ * Frees associated resources.
+ */
+ public void close() {
+
+ }
+
+
+}
Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java
===================================================================
--- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (revision 0)
+++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexWriter.java (revision 0)
@@ -0,0 +1,674 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.StringReader;
+import java.util.*;
+
+/**
+ * @see org.apache.lucene.index.IndexWriter
+ *
+ * Date: May 6, 2006
+ * Time: 8:41:02 AM
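+ *
+ * <p>A minimal usage sketch (assumes an existing InstantiatedIndex instance and, purely as an example, Lucene's StandardAnalyzer):
+ * <pre>
+ * InstantiatedIndexWriter writer = new InstantiatedIndexWriter(index, new StandardAnalyzer());
+ * writer.addDocument(document); // buffered until commit
+ * writer.commit(); // flushes buffered documents and deletions to the index
+ * writer.close();
+ * </pre>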
+ */
+public class InstantiatedIndexWriter {
+
+ private PrintStream infoStream = null;
+
+ private int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;
+
+ private final InstantiatedIndex index;
+ private final Analyzer analyzer;
+
+ private Similarity similarity = Similarity.getDefault(); // how to normalize;
+
+ private transient Set<String> fieldNameBuffer;
+ /**
+ * linked to ensure chronological order
+ */
+ private Map<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>> termDocumentInformationFactoryByDocument = new LinkedHashMap<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>>(2000);
+
+ private Set<InstantiatedDocument> unflushedDocuments = new HashSet<InstantiatedDocument>();
+
+ public InstantiatedIndexWriter(InstantiatedIndex index) throws IOException {
+ this(index, null);
+ }
+
+ public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer) throws IOException {
+ this(index, analyzer, false);
+ }
+
+ public InstantiatedIndexWriter(InstantiatedIndex index, Analyzer analyzer, boolean create) throws IOException {
+ this.index = index;
+ this.analyzer = analyzer;
+ fieldNameBuffer = new HashSet<String>();
+ if (create) {
+ this.index.initialize();
+ }
+ }
+
+ private int mergeFactor = 2500;
+
+ /**
+ * The sweet spot for this implementation is somewhere around 2500 buffered documents of about 2K characters each.
+ *
+ * Benchmark output:
+ * <pre>
+ * ------------> Report sum by Prefix (MAddDocs) and Round (8 about 8 out of 160153)
+ * Operation round mrg buf cmpnd runCnt recsPerRun rec/s elapsedSec avgUsedMem avgTotalMem
+ * MAddDocs_20000 0 10 10 true 1 20000 81,4 245,68 200 325 152 268 156 928
+ * MAddDocs_20000 - 1 1000 10 true - - 1 - - 20000 - - 494,1 - - 40,47 - 247 119 072 - 347 025 408
+ * MAddDocs_20000 2 10 100 true 1 20000 104,8 190,81 233 895 552 363 720 704
+ * MAddDocs_20000 - 3 2000 100 true - - 1 - - 20000 - - 527,2 - - 37,94 - 266 136 448 - 378 273 792
+ * MAddDocs_20000 4 10 10 false 1 20000 103,2 193,75 222 089 792 378 273 792
+ * MAddDocs_20000 - 5 3000 10 false - - 1 - - 20000 - - 545,2 - - 36,69 - 237 917 152 - 378 273 792
+ * MAddDocs_20000 6 10 100 false 1 20000 102,7 194,67 237 018 976 378 273 792
+ * MAddDocs_20000 - 7 4000 100 false - - 1 - - 20000 - - 535,8 - - 37,33 - 309 680 640 - 501 968 896
+ * </pre>
+ *
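+ * <p>For example (a sketch; <code>index</code> and <code>analyzer</code> are assumed to exist):
+ * <pre>
+ * InstantiatedIndexWriter writer = new InstantiatedIndexWriter(index, analyzer);
+ * writer.setMergeFactor(2500); // commit the internal buffer once 2500 documents have been added
+ * </pre>
+ *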
+ * @see org.apache.lucene.index.IndexWriter#setMergeFactor(int)
+ */
+ public void setMergeFactor(int mergeFactor) {
+ this.mergeFactor = mergeFactor;
+ }
+
+ /**
+ * @see org.apache.lucene.index.IndexWriter#getMergeFactor()
+ */
+ public int getMergeFactor() {
+ return mergeFactor;
+ }
+
+
+ /**
+ * If non-null, information about merges and a message when
+ * maxFieldLength is reached will be printed to this.
+ */
+ public void setInfoStream(PrintStream infoStream) {
+ this.infoStream = infoStream;
+ }
+
+
+ public void abort() throws IOException {
+ // todo: not implemented
+ }
+
+
+ public void addIndexes(IndexReader[] readers) {
+ throw new RuntimeException("Not implemented");
+ }
+
+
+ public PrintStream getInfoStream() {
+ return infoStream;
+ }
+
+
+ /**
+ * Flushes all changes to an index and closes all associated files.
+ */
+ public void close() throws IOException {
+ commit();
+ }
+
+ /**
+ * Returns the number of documents currently in this index.
+ */
+ public int docCount() {
+ // todo: not certain. see http://www.nabble.com/IndexWriter.docCount-tf3128882.html#a8669483
+ return index.getDocumentsByNumber().length /* - index.getDeletedDocuments().size() */ + unflushedDocuments.size();
+ }
+
+ /**
+ * Locks the index and commits the buffered documents.
+ */
+ public void commit() throws IOException {
+
+ // todo write lock, unless held by caller
+
+ boolean orderedTermsDirty = false;
+ Set<InstantiatedTerm> dirtyTerms = new HashSet<InstantiatedTerm>(1000);
+
+ InstantiatedDocument[] documentsByNumber = new InstantiatedDocument[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
+ System.arraycopy(index.getDocumentsByNumber(), 0, documentsByNumber, 0, index.getDocumentsByNumber().length);
+ int documentNumber = index.getDocumentsByNumber().length;
+
+ List<InstantiatedTerm> orderedTerms = new ArrayList<InstantiatedTerm>(index.getOrderedTerms().length + 5000);
+ for (InstantiatedTerm instantiatedTerm : index.getOrderedTerms()) {
+ orderedTerms.add(instantiatedTerm);
+ }
+
+ // update norm array with fake values for new documents
+ Map<String, byte[]> normsByFieldNameAndDocumentNumber = new HashMap<String, byte[]>(index.getTermsByFieldAndText().size());
+ Set<String> fieldNames = new HashSet<String>(20);
+ fieldNames.addAll(index.getNormsByFieldNameAndDocumentNumber().keySet());
+ fieldNames.addAll(fieldNameBuffer);
+ for (String field : index.getTermsByFieldAndText().keySet()) {
+ byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
+ byte[] oldNorms = index.getNormsByFieldNameAndDocumentNumber().get(field);
+ if (oldNorms != null) {
+ System.arraycopy(oldNorms, 0, norms, 0, oldNorms.length);
+ Arrays.fill(norms, oldNorms.length, norms.length, DefaultSimilarity.encodeNorm(1.0f));
+ } else {
+ Arrays.fill(norms, 0, norms.length, DefaultSimilarity.encodeNorm(1.0f));
+ }
+ normsByFieldNameAndDocumentNumber.put(field, norms);
+ fieldNames.remove(field);
+ }
+ for (String field : fieldNames) {
+ byte[] norms = new byte[index.getDocumentsByNumber().length + termDocumentInformationFactoryByDocument.size()];
+ Arrays.fill(norms, 0, norms.length, DefaultSimilarity.encodeNorm(1.0f));
+ normsByFieldNameAndDocumentNumber.put(field, norms);
+ }
+ fieldNames.clear();
+ index.setNormsByFieldNameAndDocumentNumber(normsByFieldNameAndDocumentNumber);
+
+ for (Map.Entry<InstantiatedDocument, Map<FieldSetting, Map<String, TermDocumentInformationFactory>>> eDocumentTermDocInfoByTermTextAndField : termDocumentInformationFactoryByDocument.entrySet()) {
+
+ InstantiatedDocument document = eDocumentTermDocInfoByTermTextAndField.getKey();
+
+ // assign document number
+ document.setDocumentNumber(documentNumber++);
+ documentsByNumber[document.getDocumentNumber()] = document;
+
+ // set norms, prepare document and create optimized size collections.
+
+ int numFieldsWithTermVectorsInDocument = 0;
+ int termsInDocument = 0;
+ for (Map.Entry<FieldSetting, Map<String, TermDocumentInformationFactory>> eFieldTermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
+ if (eFieldTermDocInfoFactoriesByTermText.getKey().storeTermVector) {
+ numFieldsWithTermVectorsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
+ }
+ termsInDocument += eFieldTermDocInfoFactoriesByTermText.getValue().size();
+
+ if (eFieldTermDocInfoFactoriesByTermText.getKey().isIndexed && !eFieldTermDocInfoFactoriesByTermText.getKey().omitNorms) {
+ float norm = eFieldTermDocInfoFactoriesByTermText.getKey().boost;
+ norm *= document.getDocument().getBoost();
+ norm *= similarity.lengthNorm(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName, eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
+ normsByFieldNameAndDocumentNumber.get(eFieldTermDocInfoFactoriesByTermText.getKey().fieldName)[document.getDocumentNumber()] = Similarity.encodeNorm(norm);
+ }
+
+ }
+
+ // used for term vectors only, I think.
+ Map<InstantiatedTerm, InstantiatedTermDocumentInformation> informationByTermOfCurrentDocument = new HashMap<InstantiatedTerm, InstantiatedTermDocumentInformation>(termsInDocument);
+
+
+ Map<String, FieldSetting> documentFieldSettingsByFieldName = new HashMap<String, FieldSetting>(eDocumentTermDocInfoByTermTextAndField.getValue().size());
+
+ // terms...
+ for (Map.Entry<FieldSetting, Map<String, TermDocumentInformationFactory>> eFieldSetting_TermDocInfoFactoriesByTermText : eDocumentTermDocInfoByTermTextAndField.getValue().entrySet()) {
+ documentFieldSettingsByFieldName.put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eFieldSetting_TermDocInfoFactoriesByTermText.getKey());
+
+ // find or create term
+ for (Map.Entry<String, TermDocumentInformationFactory> eTermText_TermDocInfoFactory : eFieldSetting_TermDocInfoFactoriesByTermText.getValue().entrySet()) {
+
+ // get term..
+ InstantiatedTerm term;
+ Map<String, InstantiatedTerm> termsByText = index.getTermsByFieldAndText().get(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName);
+ if (termsByText == null) {
+ termsByText = new HashMap<String, InstantiatedTerm>(1000);
+ index.getTermsByFieldAndText().put(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, termsByText);
+ term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
+ termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
+ int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
+ pos = -1 - pos;
+ orderedTerms.add(pos, term);
+ orderedTermsDirty = true;
+ } else {
+ term = termsByText.get(eTermText_TermDocInfoFactory.getKey());
+ if (term == null) {
+ term = new InstantiatedTerm(eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName, eTermText_TermDocInfoFactory.getKey());
+ termsByText.put(eTermText_TermDocInfoFactory.getKey(), term);
+ int pos = Collections.binarySearch(orderedTerms, term, InstantiatedTerm.comparator);
+ pos = -1 - pos;
+ orderedTerms.add(pos, term);
+ orderedTermsDirty = true;
+ }
+ }
+
+ // create association: term document information
+ //
+ // [Term]-- {0..*} | {0..* ordered} --(field)[Document]
+ //
+ // |
+ // [TermDocumentInformation]
+
+ int[] positions = new int[eTermText_TermDocInfoFactory.getValue().termPositions.size()];
+ for (int i = 0; i < positions.length; i++) {
+ positions[i] = eTermText_TermDocInfoFactory.getValue().termPositions.get(i);
+ }
+
+ byte[][] payloads = new byte[eTermText_TermDocInfoFactory.getValue().payloads.size()][];
+ for (int i = 0; i < payloads.length; i++) {
+ payloads[i] = eTermText_TermDocInfoFactory.getValue().payloads.get(i);
+ }
+
+ // couple
+
+ InstantiatedTermDocumentInformation info = new InstantiatedTermDocumentInformation(term, document, /*eTermText_TermDocInfoFactory.getValue().termFrequency,*/ positions, payloads);
+
+ // todo optimize: this should be cached and updated in batches rather than reallocating the array for every added document!
+ InstantiatedTermDocumentInformation[] associatedDocuments;
+ if (term.getAssociatedDocuments() != null) {
+ associatedDocuments = new InstantiatedTermDocumentInformation[term.getAssociatedDocuments().length + 1];
+ System.arraycopy(term.getAssociatedDocuments(), 0, associatedDocuments, 0, term.getAssociatedDocuments().length);
+ } else {
+ associatedDocuments = new InstantiatedTermDocumentInformation[1];
+ }
+ associatedDocuments[associatedDocuments.length - 1] = info;
+ term.setAssociatedDocuments(associatedDocuments);
+
+ // todo optimize, only if term vector?
+ informationByTermOfCurrentDocument.put(term, info);
+
+
+ dirtyTerms.add(term);
+ }
+
+ // term vector offsets
+ if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().storeOffsetWithTermVector) {
+ for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> e : informationByTermOfCurrentDocument.entrySet()) {
+ if (eFieldSetting_TermDocInfoFactoriesByTermText.getKey().fieldName.equals(e.getKey().field())) {
+ TermDocumentInformationFactory factory = eFieldSetting_TermDocInfoFactoriesByTermText.getValue().get(e.getKey().text());
+ e.getValue().setTermOffsets(factory.termOffsets.toArray(new TermVectorOffsetInfo[0]));
+ }
+ }
+ }
+ }
+
+ Map<String, List<InstantiatedTermDocumentInformation>> termDocumentInformationsByField = new HashMap<String, List<InstantiatedTermDocumentInformation>>();
+ for (Map.Entry<InstantiatedTerm, InstantiatedTermDocumentInformation> eTerm_TermDocumentInformation : informationByTermOfCurrentDocument.entrySet()) {
+ List<InstantiatedTermDocumentInformation> termDocumentInformations = termDocumentInformationsByField.get(eTerm_TermDocumentInformation.getKey().field());
+ if (termDocumentInformations == null) {
+ termDocumentInformations = new ArrayList<InstantiatedTermDocumentInformation>();
+ termDocumentInformationsByField.put(eTerm_TermDocumentInformation.getKey().field(), termDocumentInformations);
+ }
+ termDocumentInformations.add(eTerm_TermDocumentInformation.getValue());
+ }
+
+ for (Map.Entry<String, List<InstantiatedTermDocumentInformation>> eField_TermDocInfos : termDocumentInformationsByField.entrySet()) {
+
+ Collections.sort(eField_TermDocInfos.getValue(), new Comparator<InstantiatedTermDocumentInformation>() {
+ public int compare(InstantiatedTermDocumentInformation instantiatedTermDocumentInformation, InstantiatedTermDocumentInformation instantiatedTermDocumentInformation1) {
+ return instantiatedTermDocumentInformation.getTerm().getTerm().compareTo(instantiatedTermDocumentInformation1.getTerm().getTerm());
+ }
+ });
+
+ // add term vector
+ if (documentFieldSettingsByFieldName.get(eField_TermDocInfos.getKey()).storeTermVector) {
+ if (document.getVectorSpace() == null) {
+ document.setVectorSpace(new HashMap<String, List<InstantiatedTermDocumentInformation>>(documentFieldSettingsByFieldName.size()));
+ }
+ document.getVectorSpace().put(eField_TermDocInfos.getKey(), eField_TermDocInfos.getValue());
+ }
+
+ }
+ }
+
+ // order document informations in dirty terms
+ for (InstantiatedTerm term : dirtyTerms) {
+ // todo optimize, I believe this is unnecessary, as the natural order is already by document number?
+ Arrays.sort(term.getAssociatedDocuments(), InstantiatedTermDocumentInformation.documentNumberComparator);
+
+// // update association class reference for speedy skipTo()
+// for (int i = 0; i < term.getAssociatedDocuments().length; i++) {
+// term.getAssociatedDocuments()[i].setIndexFromTerm(i);
+// }
+ }
+
+
+ // flush to writer
+ index.setDocumentsByNumber(documentsByNumber);
+ index.setOrderedTerms(orderedTerms.toArray(new InstantiatedTerm[0]));
+
+ // set term index
+ if (orderedTermsDirty) {
+ // todo optimize, only update from start position
+ for (int i = 0; i < index.getOrderedTerms().length; i++) {
+ index.getOrderedTerms()[i].setTermIndex(i);
+ }
+
+ }
+
+ // remove deleted documents
+ IndexReader indexDeleter = index.indexReaderFactory();
+ if (unflushedDeletions.size() > 0) {
+ for (Term term : unflushedDeletions) {
+ indexDeleter.deleteDocuments(term);
+ }
+ unflushedDeletions.clear();
+ }
+
+
+ // all done, clear buffers
+ unflushedDocuments.clear();
+ termDocumentInformationFactoryByDocument.clear();
+ fieldNameBuffer.clear();
+
+ index.setVersion(System.currentTimeMillis());
+
+ // todo unlock
+
+ indexDeleter.close();
+
+ }
+
+ /**
+ * Adds a document to this index. If the document contains more than
+ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+ * discarded.
+ */
+ public void addDocument(Document doc) throws IOException {
+ addDocument(doc, getAnalyzer());
+ }
+
+ /**
+ * Adds a document to this index, using the provided analyzer instead of the
+ * value of {@link #getAnalyzer()}. If the document contains more than
+ * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are
+ * discarded.
+ *
+ * @param doc
+ * @param analyzer
+ * @throws IOException
+ */
+ public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+ addDocument(new InstantiatedDocument(doc), analyzer);
+ }
+
+ /**
+ * Tokenizes a document and adds it to the buffer.
+ * Try to do all calculations in this method rather than in commit, as this is a non-locking method.
+ * Remember, this index implementation expects unlimited memory for maximum speed.
+ *
+ * @param document
+ * @param analyzer
+ * @throws IOException
+ */
+ protected void addDocument(InstantiatedDocument document, Analyzer analyzer) throws IOException {
+
+ if (document.getDocumentNumber() != null) {
+ throw new RuntimeException("Document number already set! Are you trying to add a document that already is bound to this or another index?");
+ }
+
+ // todo: write lock
+
+ // normalize settings per field name in document
+
+ Map<String, FieldSetting> fieldSettingsByFieldName = new HashMap<String, FieldSetting>();
+ for (Field field : (List<Field>) document.getDocument().getFields()) {
+ FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
+ if (fieldSettings == null) {
+ fieldSettings = new FieldSetting();
+ fieldSettings.fieldName = field.name().intern();
+ fieldSettingsByFieldName.put(fieldSettings.fieldName, fieldSettings);
+ fieldNameBuffer.add(fieldSettings.fieldName);
+ }
+
+ // todo: fixme: multiple fields with the same name do not mean field boost += more boost.
+ fieldSettings.boost *= field.getBoost();
+ //fieldSettings.dimensions++;
+
+ // once a flag is set by any field instance, it stays set for the field name.
+ if (field.getOmitNorms() != fieldSettings.omitNorms) {
+ fieldSettings.omitNorms = true;
+ }
+ if (field.isIndexed() != fieldSettings.isIndexed) {
+ fieldSettings.isIndexed = true;
+ }
+ if (field.isTokenized() != fieldSettings.isTokenized) {
+ fieldSettings.isTokenized = true;
+ }
+ if (field.isCompressed() != fieldSettings.isCompressed) {
+ fieldSettings.isCompressed = true;
+ }
+ if (field.isStored() != fieldSettings.isStored) {
+ fieldSettings.isStored = true;
+ }
+ if (field.isBinary() != fieldSettings.isBinary) {
+ fieldSettings.isBinary = true;
+ }
+ if (field.isTermVectorStored() != fieldSettings.storeTermVector) {
+ fieldSettings.storeTermVector = true;
+ }
+ if (field.isStorePositionWithTermVector() != fieldSettings.storePositionWithTermVector) {
+ fieldSettings.storePositionWithTermVector = true;
+ }
+ if (field.isStoreOffsetWithTermVector() != fieldSettings.storeOffsetWithTermVector) {
+ fieldSettings.storeOffsetWithTermVector = true;
+ }
+ }
+
+ Map<Field, LinkedList<Token>> tokensByField = new LinkedHashMap<Field, LinkedList<Token>>(20);
+
+ // tokenize indexed fields.
+ for (Iterator<Field> it = (Iterator<Field>) document.getDocument().getFields().iterator(); it.hasNext();) {
+
+ Field field = it.next();
+
+ FieldSetting fieldSettings = fieldSettingsByFieldName.get(field.name());
+
+ if (field.isIndexed()) {
+
+ LinkedList<Token> tokens = new LinkedList<Token>();
+ tokensByField.put(field, tokens);
+
+ if (field.isTokenized()) {
+ int termCounter = 0;
+ final TokenStream tokenStream;
+ // todo readerValue(), binaryValue()
+ if (field.tokenStreamValue() != null) {
+ tokenStream = field.tokenStreamValue();
+ } else {
+ tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
+ }
+ Token next = tokenStream.next();
+
+ while (next != null) {
+ next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
+ tokens.add(next); // the vector will be built on commit.
+ next = tokenStream.next();
+ fieldSettings.fieldLength++;
+ if (fieldSettings.fieldLength > maxFieldLength) {
+ break;
+ }
+ }
+ } else {
+ // untokenized
+ tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
+ fieldSettings.fieldLength++;
+ }
+ }
+
+ if (!field.isStored()) {
+ it.remove();
+ }
+ }
+
+
+ Map<FieldSetting, Map<String, TermDocumentInformationFactory>> termDocumentInformationFactoryByTermTextAndFieldSetting = new HashMap<FieldSetting, Map<String, TermDocumentInformationFactory>>();
+ termDocumentInformationFactoryByDocument.put(document, termDocumentInformationFactoryByTermTextAndFieldSetting);
+
+ // build term vector, term positions and term offsets
+ for (Map.Entry<Field, LinkedList<Token>> eField_Tokens : tokensByField.entrySet()) {
+ FieldSetting fieldSettings = fieldSettingsByFieldName.get(eField_Tokens.getKey().name());
+
+ Map<String, TermDocumentInformationFactory> termDocumentInformationFactoryByTermText = termDocumentInformationFactoryByTermTextAndFieldSetting.get(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()));
+ if (termDocumentInformationFactoryByTermText == null) {
+ termDocumentInformationFactoryByTermText = new HashMap<String, TermDocumentInformationFactory>();
+ termDocumentInformationFactoryByTermTextAndFieldSetting.put(fieldSettingsByFieldName.get(eField_Tokens.getKey().name()), termDocumentInformationFactoryByTermText);
+ }
+
+ int lastOffset = 0;
+
+ // a field with this name was already seen in this document: move positions forward by the analyzer's position increment gap.
+ if (fieldSettings.position > 0) {
+ // todo what if no analyzer set, multiple fields with same name and index without tokenization?
+ fieldSettings.position += analyzer.getPositionIncrementGap(fieldSettings.fieldName);
+ }
+
+ for (Token token : eField_Tokens.getValue()) {
+
+ TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText());
+ if (termDocumentInformationFactory == null) {
+ termDocumentInformationFactory = new TermDocumentInformationFactory();
+ termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory);
+ }
+ //termDocumentInformationFactory.termFrequency++;
+
+ fieldSettings.position += (token.getPositionIncrement() - 1);
+ termDocumentInformationFactory.termPositions.add(fieldSettings.position++);
+
+ if (token.getPayload() != null && token.getPayload().length() > 0) {
+ termDocumentInformationFactory.payloads.add(token.getPayload().toByteArray());
+ } else {
+ termDocumentInformationFactory.payloads.add(null);
+ }
+
+ if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
+
+ termDocumentInformationFactory.termOffsets.add(new TermVectorOffsetInfo(fieldSettings.offset + token.startOffset(), fieldSettings.offset + token.endOffset()));
+ lastOffset = fieldSettings.offset + token.endOffset();
+ }
+
+
+ }
+
+ if (eField_Tokens.getKey().isStoreOffsetWithTermVector()) {
+ fieldSettings.offset = lastOffset + 1;
+ }
+
+ }
+
+
+ unflushedDocuments.add(document);
+
+ // if too many documents in buffer, commit.
+ if (unflushedDocuments.size() >= getMergeFactor()) {
+ commit(/*lock*/);
+ }
+
+ // todo: unlock write lock
+
+ }
+
+
+ private Set<Term> unflushedDeletions = new HashSet<Term>();
+
+ public void deleteDocuments(Term term) throws IOException {
+ unflushedDeletions.add(term);
+ }
+
+ public void deleteDocuments(Term[] terms) throws IOException {
+ for (Term term : terms) {
+ deleteDocuments(term);
+ }
+ }
+
+ public void updateDocument(Term term, Document doc) throws IOException {
+ updateDocument(term, doc, getAnalyzer());
+ }
+
+ public void updateDocument(Term term, Document doc, Analyzer analyzer) throws IOException {
+ deleteDocuments(term);
+ addDocument(doc, analyzer);
+ }
+
+ public int getMaxFieldLength() {
+ return maxFieldLength;
+ }
+
+ public void setMaxFieldLength(int maxFieldLength) {
+ this.maxFieldLength = maxFieldLength;
+ }
+
+ public Similarity getSimilarity() {
+ return similarity;
+ }
+
+ public void setSimilarity(Similarity similarity) {
+ this.similarity = similarity;
+ }
+
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+
+ private class FieldSetting {
+ private String fieldName;
+
+ private float boost = 1;
+ //private int dimensions = 0; // this is futuristic
+ private int position = 0;
+ private int offset;
+ private int fieldLength = 0;
+
+ private boolean storeTermVector = false;
+ private boolean storeOffsetWithTermVector = false;
+ private boolean storePositionWithTermVector = false;
+ private boolean omitNorms = false;
+ private boolean isTokenized = false;
+
+ private boolean isStored = false;
+ private boolean isIndexed = false;
+ private boolean isBinary = false;
+ private boolean isCompressed = false;
+
+ //private float norm;
+ //private byte encodedNorm;
+
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ final FieldSetting that = (FieldSetting) o;
+
+ return fieldName.equals(that.fieldName);
+
+ }
+
+ public int hashCode() {
+ return fieldName.hashCode();
+ }
+ }
+
+ private class TermDocumentInformationFactory {
+ private LinkedList<byte[]> payloads = new LinkedList<byte[]>();
+ private LinkedList<Integer> termPositions = new LinkedList<Integer>();
+ private LinkedList<TermVectorOffsetInfo> termOffsets = new LinkedList<TermVectorOffsetInfo>();
+ }
+
+}
Index: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html
===================================================================
--- contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html (revision 0)
+++ contrib/instantiated/src/java/org/apache/lucene/store/instantiated/package.html (revision 0)
@@ -0,0 +1,89 @@
+<html>
+<head>
+<title>
+ InstantiatedIndex
+</title>
+</head>
+<body>
+<h2>Abstract</h2>
+
+<p>
+ Represented as a coupled graph of class instances, this
+ all-in-memory index store implementation delivers search
+ results up to 100 times faster than the file-centric RAMDirectory
+ at the cost of greater RAM consumption.
+</p>
+
+<h2>API</h2>
+
+<p>
+ Just as the default store implementation, InstantiatedIndex
+ comes with an IndexReader and an IndexWriter. The latter shares
+ many method signatures with the file-centric IndexWriter.
+</p>
+
+<p>
+ It is also possible to load the contents of another index
+ by passing an IndexReader to the InstantiatedIndex constructor.
+</p>
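+<p>
+ A minimal sketch (the analyzer and the document fields are arbitrary examples;
+ a no-argument InstantiatedIndex constructor is assumed):
+</p>
+<pre>
+InstantiatedIndex index = new InstantiatedIndex();
+InstantiatedIndexWriter writer = new InstantiatedIndexWriter(index, new StandardAnalyzer());
+Document document = new Document();
+document.add(new Field("title", "hello world", Field.Store.YES, Field.Index.TOKENIZED));
+writer.addDocument(document);
+writer.close(); // commits the buffered document
+IndexReader reader = index.indexReaderFactory();
+// or load an existing index: new InstantiatedIndex(someIndexReader)
+</pre>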
+
+<h2>Performance</h2>
+
+<p>
+ At a few thousand ~160 character documents,
+ InstantiatedIndex outperforms RAMDirectory some 50x,
+ 15x at 100 documents of 2000 characters length,
+ and is on par with RAMDirectory at 10,000 documents of 2000 characters length.
+</p>
+
+<p>
+ Mileage may vary depending on term saturation.
+</p>
+
+<p>
+ Populated with a single document, InstantiatedIndex is almost, but not quite, as fast as MemoryIndex.
+</p>
+
+<p>
+ It takes more or less the same time to populate an InstantiatedIndex
+ as it takes to populate a RAMDirectory. Hardly any effort has been put
+ into optimizing the InstantiatedIndexWriter; only minimizing the amount
+ of time needed to write-lock the index has been considered.
+</p>
+
+
+<h2>Caveats</h2>
+<ul>
+ <li>No locks, so don't add new documents while searching!</li>
+ <li>No documents with fields containing readers!</li>
+ <li>Only FieldOption.ALL is allowed by IndexReader#getFieldNames(FieldOption).</li>
+ <li>No field selection when retrieving documents, as all stored fields are available in memory.</li>
+</ul>
+
+<h2>Use cases</h2>
+
+<p>
+ Could replace any small index that could do with greater response time,
+ such as an a priori index for spell checking,
+ the index of new documents exposed to user search agent queries,
+ or compiled classifiers in machine learning environments, etc.
+</p>
+
+<h2>Class diagram</h2>
+
+<p>
+ Diagram rendered using UMLet 7.1.
+</p>
+</body>
+</html>
\ No newline at end of file
Index: contrib/instantiated/build.xml
===================================================================
--- contrib/instantiated/build.xml (revision 0)
+++ contrib/instantiated/build.xml (revision 0)
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<!--
+ Copyright 2006 The Apache Software Foundation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project name="instantiated" default="default">
+
+ <description>
+ InstantiatedIndex, an alternative RAM store.
+ </description>
+
+ <import file="../contrib-build.xml"/>
+
+</project>