Index: IndexWriter.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/IndexWriter.java,v
retrieving revision 1.44
diff -u -r1.44 IndexWriter.java
--- IndexWriter.java 12 Dec 2004 20:26:27 -0000 1.44
+++ IndexWriter.java 14 Dec 2004 23:36:31 -0000
@@ -31,709 +31,860 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.Analyzer;
-
/**
- An IndexWriter creates and maintains an index.
-
- The third argument to the
- constructor
- determines whether a new index is created, or whether an existing index is
- opened for the addition of new documents.
-
- In either case, documents are added with the addDocument method.
- When finished adding documents, close should be called.
-
- If an index will not have more documents added for a while and optimal search
- performance is desired, then the optimize
- method should be called before the index is closed.
- */
+ * An IndexWriter creates and maintains an index.
+ *
+ * The third argument to the constructor
+ * determines whether a new index is created, or whether an existing index is
+ * opened for the addition of new documents.
+ *
+ * In either case, documents are added with the addDocument
+ * method. When finished adding documents, close
+ * should be called.
+ *
+ * If an index will not have more documents added for a while and optimal search
+ * performance is desired, then the optimize
+ * method should be called before the index is closed.
+ */
public class IndexWriter {
- /**
- * Default value is 1000. Use org.apache.lucene.writeLockTimeout
- * system property to override.
- */
- public static long WRITE_LOCK_TIMEOUT =
- Integer.parseInt(System.getProperty("org.apache.lucene.writeLockTimeout",
- "1000"));
-
- /**
- * Default value is 10000. Use org.apache.lucene.commitLockTimeout
- * system property to override.
- */
- public static long COMMIT_LOCK_TIMEOUT =
- Integer.parseInt(System.getProperty("org.apache.lucene.commitLockTimeout",
- "10000"));
-
- public static final String WRITE_LOCK_NAME = "write.lock";
- public static final String COMMIT_LOCK_NAME = "commit.lock";
-
- /**
- * Default value is 10. Use org.apache.lucene.mergeFactor
- * system property to override.
- */
- public static final int DEFAULT_MERGE_FACTOR =
- Integer.parseInt(System.getProperty("org.apache.lucene.mergeFactor",
- "10"));
-
- /**
- * Default value is 10. Use org.apache.lucene.minMergeDocs
- * system property to override.
- */
- public static final int DEFAULT_MIN_MERGE_DOCS =
- Integer.parseInt(System.getProperty("org.apache.lucene.minMergeDocs",
- "10"));
-
- /**
- * Default value is {@link Integer#MAX_VALUE}.
- * Use org.apache.lucene.maxMergeDocs system property to override.
- */
- public static final int DEFAULT_MAX_MERGE_DOCS =
- Integer.parseInt(System.getProperty("org.apache.lucene.maxMergeDocs",
- String.valueOf(Integer.MAX_VALUE)));
-
- /**
- * Default value is 10000. Use org.apache.lucene.maxFieldLength
- * system property to override.
- */
- public static final int DEFAULT_MAX_FIELD_LENGTH =
- Integer.parseInt(System.getProperty("org.apache.lucene.maxFieldLength",
- "10000"));
-
-
- private Directory directory; // where this index resides
- private Analyzer analyzer; // how to analyze text
-
- private Similarity similarity = Similarity.getDefault(); // how to normalize
-
- private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
- private final Directory ramDirectory = new RAMDirectory(); // for temp segs
-
- private Lock writeLock;
-
- /** Use compound file setting. Defaults to true, minimizing the number of
- * files used. Setting this to false may improve indexing performance, but
- * may also cause file handle problems.
- */
- private boolean useCompoundFile = true;
-
- private boolean closeDir;
-
- /** Get the current setting of whether to use the compound file format.
- * Note that this just returns the value you set with setUseCompoundFile(boolean)
- * or the default. You cannot use this to query the status of an existing index.
- * @see #setUseCompoundFile(boolean)
- */
- public boolean getUseCompoundFile() {
- return useCompoundFile;
- }
-
- /** Setting to turn on usage of a compound file. When on, multiple files
- * for each segment are merged into a single file once the segment creation
- * is finished. This is done regardless of what directory is in use.
- */
- public void setUseCompoundFile(boolean value) {
- useCompoundFile = value;
- }
-
- /** Expert: Set the Similarity implementation used by this IndexWriter.
- *
- * @see Similarity#setDefault(Similarity)
- */
- public void setSimilarity(Similarity similarity) {
- this.similarity = similarity;
- }
-
- /** Expert: Return the Similarity implementation used by this IndexWriter.
- *
- *
- * This defaults to the current value of {@link Similarity#getDefault()}.
- */
- public Similarity getSimilarity() {
- return this.similarity;
- }
-
- /**
- * Constructs an IndexWriter for the index in path.
- * Text will be analyzed with a. If create
- * is true, then a new, empty index will be created in
- * path, replacing the index already there, if any.
- *
- * @param path the path to the index directory
- * @param a the analyzer to use
- * @param create true to create the index or overwrite
- * the existing one; false to append to the existing
- * index
- * @throws IOException if the directory cannot be read/written to, or
- * if it does not exist, and create is
- * false
- */
- public IndexWriter(String path, Analyzer a, boolean create)
- throws IOException {
- this(FSDirectory.getDirectory(path, create), a, create, true);
- }
-
- /**
- * Constructs an IndexWriter for the index in path.
- * Text will be analyzed with a. If create
- * is true, then a new, empty index will be created in
- * path, replacing the index already there, if any.
- *
- * @param path the path to the index directory
- * @param a the analyzer to use
- * @param create true to create the index or overwrite
- * the existing one; false to append to the existing
- * index
- * @throws IOException if the directory cannot be read/written to, or
- * if it does not exist, and create is
- * false
- */
- public IndexWriter(File path, Analyzer a, boolean create)
- throws IOException {
- this(FSDirectory.getDirectory(path, create), a, create, true);
- }
-
- /**
- * Constructs an IndexWriter for the index in d.
- * Text will be analyzed with a. If create
- * is true, then a new, empty index will be created in
- * d, replacing the index already there, if any.
- *
- * @param d the index directory
- * @param a the analyzer to use
- * @param create true to create the index or overwrite
- * the existing one; false to append to the existing
- * index
- * @throws IOException if the directory cannot be read/written to, or
- * if it does not exist, and create is
- * false
- */
- public IndexWriter(Directory d, Analyzer a, boolean create)
- throws IOException {
- this(d, a, create, false);
- }
-
- private IndexWriter(Directory d, Analyzer a, final boolean create, boolean closeDir)
- throws IOException {
- this.closeDir = closeDir;
- directory = d;
- analyzer = a;
-
- Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
- if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
- throw new IOException("Index locked for write: " + writeLock);
- this.writeLock = writeLock; // save it
-
- synchronized (directory) { // in- & inter-process sync
- new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
- public Object doBody() throws IOException {
- if (create)
- segmentInfos.write(directory);
- else
- segmentInfos.read(directory);
- return null;
- }
- }.run();
- }
- }
-
- /** Determines the largest number of documents ever merged by addDocument().
- * Small values (e.g., less than 10,000) are best for interactive indexing,
- * as this limits the length of pauses while indexing to a few seconds.
- * Larger values are best for batched indexing and speedier searches.
- *
- *
The default value is {@link Integer#MAX_VALUE}. - */ - public void setMaxMergeDocs(int maxMergeDocs) { - this.maxMergeDocs = maxMergeDocs; - } - - /** - * @see #setMaxMergeDocs - */ - public int getMaxMergeDocs() { - return maxMergeDocs; - } - - /** - * The maximum number of terms that will be indexed for a single field in a - * document. This limits the amount of memory required for indexing, so that - * collections with very large files will not crash the indexing process by - * running out of memory.
- * Note that this effectively truncates large documents, excluding from the - * index terms that occur further in the document. If you know your source - * documents are large, be sure to set this value high enough to accomodate - * the expected size. If you set it to Integer.MAX_VALUE, then the only limit - * is your memory, but you should anticipate an OutOfMemoryError. - * By default, no more than 10,000 terms will be indexed for a field. - */ - public void setMaxFieldLength(int maxFieldLength) { - this.maxFieldLength = maxFieldLength; - } - - /** - * @see #setMaxFieldLength - */ - public int getMaxFieldLength() { - return maxFieldLength; - } - - /** Determines the minimal number of documents required before the buffered - * in-memory documents are merging and a new Segment is created. - * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory}, - * large value gives faster indexing. At the same time, mergeFactor limits - * the number of files open in a FSDirectory. - * - *The default value is 10. - */ - public void setMaxBufferedDocs(int maxBufferedDocs) { - this.minMergeDocs = maxBufferedDocs; - } - - /** - * @see #setMaxBufferedDocs - */ - public int getMaxBufferedDocs() { - return minMergeDocs; - } - - /** Determines how often segment indices are merged by addDocument(). With - * smaller values, less RAM is used while indexing, and searches on - * unoptimized indices are faster, but indexing speed is slower. With larger - * values, more RAM is used during indexing, and while searches on unoptimized - * indices are slower, indexing is faster. Thus larger values (> 10) are best - * for batch index creation, and smaller values (< 10) for indices that are - * interactively maintained. - * - *
This must never be less than 2. The default value is 10. - */ - public void setMergeFactor(int mergeFactor) { - if (mergeFactor < 2) - throw new IllegalArgumentException("mergeFactor cannot be less than 2"); - this.mergeFactor = mergeFactor; - } - - /** - * @see #setMergeFactor - */ - public int getMergeFactor() { - return mergeFactor; - } - - /** If non-null, information about merges and a message when - * maxFieldLength is reached will be printed to this. - */ - public void setInfoStream(PrintStream infoStream) { - this.infoStream = infoStream; - } - - /** - * @see #setInfoStream - */ - public PrintStream getInfoStream() { - return infoStream; - } - - /** Flushes all changes to an index and closes all associated files. */ - public synchronized void close() throws IOException { - flushRamSegments(); - ramDirectory.close(); - if (writeLock != null) { - writeLock.release(); // release write lock - writeLock = null; - } - if(closeDir) - directory.close(); - } - - /** Release the write lock, if needed. */ - protected void finalize() throws IOException { - if (writeLock != null) { - writeLock.release(); // release write lock - writeLock = null; - } - } - - /** Returns the analyzer used by this index. */ - public Analyzer getAnalyzer() { - return analyzer; - } - - - /** Returns the number of documents currently in this index. */ - public synchronized int docCount() { - int count = 0; - for (int i = 0; i < segmentInfos.size(); i++) { - SegmentInfo si = segmentInfos.info(i); - count += si.docCount; - } - return count; - } - - /** - * The maximum number of terms that will be indexed for a single field in a - * document. This limits the amount of memory required for indexing, so that - * collections with very large files will not crash the indexing process by - * running out of memory.
- * Note that this effectively truncates large documents, excluding from the - * index terms that occur further in the document. If you know your source - * documents are large, be sure to set this value high enough to accomodate - * the expected size. If you set it to Integer.MAX_VALUE, then the only limit - * is your memory, but you should anticipate an OutOfMemoryError. - * By default, no more than 10,000 terms will be indexed for a field. - * - * @deprecated use {@link #setMaxFieldLength} instead - */ - public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH; - - /** - * Adds a document to this index. If the document contains more than - * {@link #maxFieldLength} terms for a given field, the remainder are - * discarded. - */ - public void addDocument(Document doc) throws IOException { - addDocument(doc, analyzer); - } - - /** - * Adds a document to this index, using the provided analyzer instead of the - * value of {@link #getAnalyzer()}. If the document contains more than - * {@link #maxFieldLength} terms for a given field, the remainder are - * discarded. - */ - public void addDocument(Document doc, Analyzer analyzer) throws IOException { - DocumentWriter dw = - new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength); - dw.setInfoStream(infoStream); - String segmentName = newSegmentName(); - dw.addDocument(segmentName, doc); - synchronized (this) { - segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory)); - maybeMergeSegments(); - } - } - - final int getSegmentsCounter(){ - return segmentInfos.counter; - } - - private final synchronized String newSegmentName() { - return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX); - } - - /** Determines how often segment indices are merged by addDocument(). With - * smaller values, less RAM is used while indexing, and searches on - * unoptimized indices are faster, but indexing speed is slower. With larger - * values, more RAM is used during indexing, and while searches on unoptimized - * indices are slower, indexing is faster. Thus larger values (> 10) are best - * for batch index creation, and smaller values (< 10) for indices that are - * interactively maintained. - * - *This must never be less than 2. The default value is 10. - * @deprecated use {@link #setMergeFactor} instead - */ - public int mergeFactor = DEFAULT_MERGE_FACTOR; - - /** Determines the minimal number of documents required before the buffered - * in-memory documents are merging and a new Segment is created. - * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory}, - * large value gives faster indexing. At the same time, mergeFactor limits - * the number of files open in a FSDirectory. - * - *
The default value is 10. - * @deprecated use {@link #setMaxBufferedDocs} instead - */ - public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS; - - - /** Determines the largest number of documents ever merged by addDocument(). - * Small values (e.g., less than 10,000) are best for interactive indexing, - * as this limits the length of pauses while indexing to a few seconds. - * Larger values are best for batched indexing and speedier searches. - * - *
The default value is {@link Integer#MAX_VALUE}. - * @deprecated use {@link #setMaxMergeDocs} instead - */ - public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; - - /** If non-null, information about merges will be printed to this. - * @deprecated use {@link #setInfoStream} instead - */ - public PrintStream infoStream = null; - - /** Merges all segments together into a single segment, optimizing an index - for search. */ - public synchronized void optimize() throws IOException { - flushRamSegments(); - while (segmentInfos.size() > 1 || - (segmentInfos.size() == 1 && - (SegmentReader.hasDeletions(segmentInfos.info(0)) || - segmentInfos.info(0).dir != directory || - (useCompoundFile && - (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) || - SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) { - int minSegment = segmentInfos.size() - mergeFactor; - mergeSegments(minSegment < 0 ? 0 : minSegment); - } - } - - /** Merges all segments from an array of indexes into this index. - * - *
This may be used to parallelize batch indexing. A large document - * collection can be broken into sub-collections. Each sub-collection can be - * indexed in parallel, on a different thread, process or machine. The - * complete index can then be created by merging sub-collection indexes - * with this method. - * - *
After this completes, the index is optimized. */ - public synchronized void addIndexes(Directory[] dirs) - throws IOException { - optimize(); // start with zero or 1 seg - for (int i = 0; i < dirs.length; i++) { - SegmentInfos sis = new SegmentInfos(); // read infos from dir - sis.read(dirs[i]); - for (int j = 0; j < sis.size(); j++) { - segmentInfos.addElement(sis.info(j)); // add each info - } - } - optimize(); // final cleanup - } - - /** Merges the provided indexes into this index. - *
After this completes, the index is optimized.
- *The provided IndexReaders are not closed.
- */ - public synchronized void addIndexes(IndexReader[] readers) - throws IOException { - - optimize(); // start with zero or 1 seg - - final String mergedName = newSegmentName(); - SegmentMerger merger = new SegmentMerger(directory, mergedName); - - final Vector segmentsToDelete = new Vector(); - IndexReader sReader = null; - if (segmentInfos.size() == 1){ // add existing index, if any - sReader = SegmentReader.get(segmentInfos.info(0)); - merger.add(sReader); - segmentsToDelete.addElement(sReader); // queue segment for deletion - } - - for (int i = 0; i < readers.length; i++) // add new indexes - merger.add(readers[i]); - - int docCount = merger.merge(); // merge 'em - - segmentInfos.setSize(0); // pop old infos & add new - segmentInfos.addElement(new SegmentInfo(mergedName, docCount, directory)); - - if(sReader != null) - sReader.close(); - - synchronized (directory) { // in- & inter-process sync - new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - segmentInfos.write(directory); // commit changes - deleteSegments(segmentsToDelete); // delete now-unused segments - return null; - } - }.run(); - } - - if (useCompoundFile) { - final Vector filesToDelete = merger.createCompoundFile(mergedName + ".tmp"); - synchronized (directory) { // in- & inter-process sync - new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - // make compound file visible for SegmentReaders - directory.renameFile(mergedName + ".tmp", mergedName + ".cfs"); - // delete now unused files of segment - deleteFiles(filesToDelete); - return null; - } - }.run(); - } - } - } - - /** Merges all RAM-resident segments. */ - private final void flushRamSegments() throws IOException { - int minSegment = segmentInfos.size()-1; - int docCount = 0; - while (minSegment >= 0 && - (segmentInfos.info(minSegment)).dir == ramDirectory) { - docCount += segmentInfos.info(minSegment).docCount; - minSegment--; - } - if (minSegment < 0 || // add one FS segment? - (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor || - !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory)) - minSegment++; - if (minSegment >= segmentInfos.size()) - return; // none to merge - mergeSegments(minSegment); - } - - /** Incremental segment merger. */ - private final void maybeMergeSegments() throws IOException { - long targetMergeDocs = minMergeDocs; - while (targetMergeDocs <= maxMergeDocs) { - // find segments smaller than current target size - int minSegment = segmentInfos.size(); - int mergeDocs = 0; - while (--minSegment >= 0) { - SegmentInfo si = segmentInfos.info(minSegment); - if (si.docCount >= targetMergeDocs) - break; - mergeDocs += si.docCount; - } - - if (mergeDocs >= targetMergeDocs) // found a merge to do - mergeSegments(minSegment+1); - else - break; - - targetMergeDocs *= mergeFactor; // increase target size - } - } - - /** Pops segments off of segmentInfos stack down to minSegment, merges them, - and pushes the merged index onto the top of the segmentInfos stack. 
*/ - private final void mergeSegments(int minSegment) - throws IOException { - final String mergedName = newSegmentName(); - if (infoStream != null) infoStream.print("merging segments"); - SegmentMerger merger = - new SegmentMerger(directory, mergedName); - - final Vector segmentsToDelete = new Vector(); - for (int i = minSegment; i < segmentInfos.size(); i++) { - SegmentInfo si = segmentInfos.info(i); - if (infoStream != null) - infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); - IndexReader reader = SegmentReader.get(si); - merger.add(reader); - if ((reader.directory() == this.directory) || // if we own the directory - (reader.directory() == this.ramDirectory)) - segmentsToDelete.addElement(reader); // queue segment for deletion - } - - int mergedDocCount = merger.merge(); - - if (infoStream != null) { - infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)"); - } - - segmentInfos.setSize(minSegment); // pop old infos & add new - segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount, - directory)); - - // close readers before we attempt to delete now-obsolete segments - merger.closeReaders(); - - synchronized (directory) { // in- & inter-process sync - new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - segmentInfos.write(directory); // commit before deleting - deleteSegments(segmentsToDelete); // delete now-unused segments - return null; - } - }.run(); - } - - if (useCompoundFile) { - final Vector filesToDelete = merger.createCompoundFile(mergedName + ".tmp"); - synchronized (directory) { // in- & inter-process sync - new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) { - public Object doBody() throws IOException { - // make compound file visible for SegmentReaders - directory.renameFile(mergedName + ".tmp", mergedName + ".cfs"); - // delete now unused files of segment - deleteFiles(filesToDelete); - return null; - } - }.run(); - } - } - } - - /* - * Some operating systems (e.g. Windows) don't permit a file to be deleted - * while it is opened for read (e.g. by another process or thread). So we - * assume that when a delete fails it is because the file is open in another - * process, and queue the file for subsequent deletion. 
- */ - - private final void deleteSegments(Vector segments) throws IOException { - Vector deletable = new Vector(); - - deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable - - for (int i = 0; i < segments.size(); i++) { - SegmentReader reader = (SegmentReader)segments.elementAt(i); - if (reader.directory() == this.directory) - deleteFiles(reader.files(), deletable); // try to delete our files - else - deleteFiles(reader.files(), reader.directory()); // delete other files - } - - writeDeleteableFiles(deletable); // note files we can't delete - } - - private final void deleteFiles(Vector files) throws IOException { - Vector deletable = new Vector(); - deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable - deleteFiles(files, deletable); // try to delete our files - writeDeleteableFiles(deletable); // note files we can't delete - } - - private final void deleteFiles(Vector files, Directory directory) - throws IOException { - for (int i = 0; i < files.size(); i++) - directory.deleteFile((String)files.elementAt(i)); - } - - private final void deleteFiles(Vector files, Vector deletable) - throws IOException { - for (int i = 0; i < files.size(); i++) { - String file = (String)files.elementAt(i); - try { - directory.deleteFile(file); // try to delete each file - } catch (IOException e) { // if delete fails - if (directory.fileExists(file)) { - if (infoStream != null) - infoStream.println(e.toString() + "; Will re-try later."); - deletable.addElement(file); // add to deletable - } - } - } - } - - private final Vector readDeleteableFiles() throws IOException { - Vector result = new Vector(); - if (!directory.fileExists("deletable")) - return result; - - IndexInput input = directory.openInput("deletable"); - try { - for (int i = input.readInt(); i > 0; i--) // read file names - result.addElement(input.readString()); - } finally { - input.close(); - } - return result; - } - - private final void writeDeleteableFiles(Vector files) throws IOException { - IndexOutput output = directory.createOutput("deleteable.new"); - try { - output.writeInt(files.size()); - for (int i = 0; i < files.size(); i++) - output.writeString((String)files.elementAt(i)); - } finally { - output.close(); - } - directory.renameFile("deleteable.new", "deletable"); - } + /** + * Default value is 1000. Use + *org.apache.lucene.writeLockTimeout system property to
+ * override.
+ */
+ public static long WRITE_LOCK_TIMEOUT = Integer.parseInt(System
+ .getProperty("org.apache.lucene.writeLockTimeout", "1000"));
+
+ /**
+ * Default value is 10000. Use
+ * org.apache.lucene.commitLockTimeout system property to
+ * override.
+ */
+ public static long COMMIT_LOCK_TIMEOUT = Integer.parseInt(System
+ .getProperty("org.apache.lucene.commitLockTimeout", "10000"));
+
+ public static final String WRITE_LOCK_NAME = "write.lock";
+
+ public static final String COMMIT_LOCK_NAME = "commit.lock";
+
+ /**
+ * Default value is 10. Use org.apache.lucene.mergeFactor
+ * system property to override.
+ */
+ public static final int DEFAULT_MERGE_FACTOR = Integer.parseInt(System
+ .getProperty("org.apache.lucene.mergeFactor", "10"));
+
+ /**
+ * Default value is 10. Use org.apache.lucene.minMergeDocs
+ * system property to override.
+ */
+ public static final int DEFAULT_MIN_MERGE_DOCS = Integer.parseInt(System
+ .getProperty("org.apache.lucene.minMergeDocs", "10"));
+
+ /**
+ * Default value is {@link Integer#MAX_VALUE}. Use
+ * org.apache.lucene.maxMergeDocs system property to
+ * override.
+ */
+ public static final int DEFAULT_MAX_MERGE_DOCS = Integer.parseInt(System
+ .getProperty("org.apache.lucene.maxMergeDocs", String
+ .valueOf(Integer.MAX_VALUE)));
+
+ /**
+ * Default value is 10000. Use org.apache.lucene.maxFieldLength
+ * system property to override.
+ */
+ public static final int DEFAULT_MAX_FIELD_LENGTH = Integer.parseInt(System
+ .getProperty("org.apache.lucene.maxFieldLength", "10000"));
+
+ private Directory directory; // where this index resides
+
+ private Analyzer analyzer; // how to analyze text
+
+ private Similarity similarity = Similarity.getDefault(); // how to normalize
+
+ private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
+
+ private final Directory ramDirectory = new RAMDirectory(); // for temp segs
+
+ private Lock writeLock;
+
+ /**
+ * Use compound file setting. Defaults to true, minimizing the number of
+ * files used. Setting this to false may improve indexing performance, but
+ * may also cause file handle problems.
+ */
+ private boolean useCompoundFile = true;
+
+ private boolean closeDir;
+
+ /**
+ * Get the current setting of whether to use the compound file format. Note
+ * that this just returns the value you set with setUseCompoundFile(boolean)
+ * or the default. You cannot use this to query the status of an existing
+ * index.
+ *
+ * @see #setUseCompoundFile(boolean)
+ */
+ public boolean getUseCompoundFile() {
+ return useCompoundFile;
+ }
+
+ /**
+ * Setting to turn on usage of a compound file. When on, multiple files for
+ * each segment are merged into a single file once the segment creation is
+ * finished. This is done regardless of what directory is in use.
+ */
+ public void setUseCompoundFile(boolean value) {
+ useCompoundFile = value;
+ }
+
+ /**
+ * Expert: Set the Similarity implementation used by this IndexWriter.
+ *
+ * @see Similarity#setDefault(Similarity)
+ */
+ public void setSimilarity(Similarity similarity) {
+ this.similarity = similarity;
+ }
+
+ /**
+ * Expert: Return the Similarity implementation used by this IndexWriter.
+ *
+ *
+ * This defaults to the current value of {@link Similarity#getDefault()}.
+ */
+ public Similarity getSimilarity() {
+ return this.similarity;
+ }
+
+ /**
+ * Constructs an IndexWriter for the index in path. Text
+ * will be analyzed with a. If create is
+ * true, then a new, empty index will be created in path,
+ * replacing the index already there, if any.
+ *
+ * @param path
+ * the path to the index directory
+ * @param a
+ * the analyzer to use
+ * @param create
+ * true to create the index or overwrite the
+ * existing one; false to append to the existing
+ * index
+ * @throws IOException
+ * if the directory cannot be read/written to, or if it does not
+ * exist, and create is false
+ */
+ public IndexWriter(String path, Analyzer a, boolean create)
+ throws IOException {
+ this(FSDirectory.getDirectory(path, create), a, create, true);
+ }
+
+ /**
+ * Constructs an IndexWriter for the index in path. Text
+ * will be analyzed with a. If create is
+ * true, then a new, empty index will be created in path,
+ * replacing the index already there, if any.
+ *
+ * @param path
+ * the path to the index directory
+ * @param a
+ * the analyzer to use
+ * @param create
+ * true to create the index or overwrite the
+ * existing one; false to append to the existing
+ * index
+ * @throws IOException
+ * if the directory cannot be read/written to, or if it does not
+ * exist, and create is false
+ */
+ public IndexWriter(File path, Analyzer a, boolean create)
+ throws IOException {
+ this(FSDirectory.getDirectory(path, create), a, create, true);
+ }
+
+ /**
+ * Constructs an IndexWriter for the index in d. Text will
+ * be analyzed with a. If create is true,
+ * then a new, empty index will be created in d, replacing
+ * the index already there, if any.
+ *
+ * @param d
+ * the index directory
+ * @param a
+ * the analyzer to use
+ * @param create
+ * true to create the index or overwrite the
+ * existing one; false to append to the existing
+ * index
+ * @throws IOException
+ * if the directory cannot be read/written to, or if it does not
+ * exist, and create is false
+ */
+ public IndexWriter(Directory d, Analyzer a, boolean create)
+ throws IOException {
+ this(d, a, create, false);
+ }
+
+ private IndexWriter(Directory d, Analyzer a, final boolean create,
+ boolean closeDir) throws IOException {
+ this.closeDir = closeDir;
+ directory = d;
+ analyzer = a;
+
+ Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
+ if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
+ throw new IOException("Index locked for write: " + writeLock);
+ this.writeLock = writeLock; // save it
+
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ if (create)
+ segmentInfos.write(directory);
+ else
+ segmentInfos.read(directory);
+ return null;
+ }
+ }.run();
+ }
+ }
+
+ /**
+ * Determines the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ *
+ * The default value is {@link Integer#MAX_VALUE}. + */ + public void setMaxMergeDocs(int maxMergeDocs) { + this.maxMergeDocs = maxMergeDocs; + } + + /** + * @see #setMaxMergeDocs + */ + public int getMaxMergeDocs() { + return maxMergeDocs; + } + + /** + * The maximum number of terms that will be indexed for a single field in a + * document. This limits the amount of memory required for indexing, so that + * collections with very large files will not crash the indexing process by + * running out of memory.
Note that this effectively truncates large + * documents, excluding from the index terms that occur further in the + * document. If you know your source documents are large, be sure to set + * this value high enough to accomodate the expected size. If you set it to + * Integer.MAX_VALUE, then the only limit is your memory, but you should + * anticipate an OutOfMemoryError. By default, no more than 10,000 + * terms will be indexed for a field. + */ + public void setMaxFieldLength(int maxFieldLength) { + this.maxFieldLength = maxFieldLength; + } + + /** + * @see #setMaxFieldLength + */ + public int getMaxFieldLength() { + return maxFieldLength; + } + + /** + * Determines the minimal number of documents required before the buffered + * in-memory documents are merging and a new Segment is created. Since + * Documents are merged in a {@link org.apache.lucene.store.RAMDirectory}, + * large value gives faster indexing. At the same time, mergeFactor limits + * the number of files open in a FSDirectory. + * + *+ * The default value is 10. + */ + public void setMaxBufferedDocs(int maxBufferedDocs) { + this.minMergeDocs = maxBufferedDocs; + } + + /** + * @see #setMaxBufferedDocs + */ + public int getMaxBufferedDocs() { + return minMergeDocs; + } + + /** + * Determines how often segment indices are merged by addDocument(). With + * smaller values, less RAM is used while indexing, and searches on + * unoptimized indices are faster, but indexing speed is slower. With larger + * values, more RAM is used during indexing, and while searches on + * unoptimized indices are slower, indexing is faster. Thus larger values (> + * 10) are best for batch index creation, and smaller values ( < 10) for + * indices that are interactively maintained. + * + *
+ * This must never be less than 2. The default value is 10. + */ + public void setMergeFactor(int mergeFactor) { + if (mergeFactor < 2) + throw new IllegalArgumentException( + "mergeFactor cannot be less than 2"); + this.mergeFactor = mergeFactor; + } + + /** + * @see #setMergeFactor + */ + public int getMergeFactor() { + return mergeFactor; + } + + /** + * If non-null, information about merges and a message when maxFieldLength + * is reached will be printed to this. + */ + public void setInfoStream(PrintStream infoStream) { + this.infoStream = infoStream; + } + + /** + * @see #setInfoStream + */ + public PrintStream getInfoStream() { + return infoStream; + } + + /** Flushes all changes to an index and closes all associated files. */ + public synchronized void close() throws IOException { + flushRamSegments(); + ramDirectory.close(); + if (writeLock != null) { + writeLock.release(); // release write lock + writeLock = null; + } + if (closeDir) + directory.close(); + } + + /** Release the write lock, if needed. */ + protected void finalize() throws IOException { + if (writeLock != null) { + writeLock.release(); // release write lock + writeLock = null; + } + } + + /** Returns the analyzer used by this index. */ + public Analyzer getAnalyzer() { + return analyzer; + } + + /** Returns the number of documents currently in this index. */ + public synchronized int docCount() { + int count = 0; + for (int i = 0; i < segmentInfos.size(); i++) { + SegmentInfo si = segmentInfos.info(i); + count += si.docCount; + } + return count; + } + + /** + * The maximum number of terms that will be indexed for a single field in a + * document. This limits the amount of memory required for indexing, so that + * collections with very large files will not crash the indexing process by + * running out of memory.
Note that this effectively truncates large + * documents, excluding from the index terms that occur further in the + * document. If you know your source documents are large, be sure to set + * this value high enough to accomodate the expected size. If you set it to + * Integer.MAX_VALUE, then the only limit is your memory, but you should + * anticipate an OutOfMemoryError. By default, no more than 10,000 + * terms will be indexed for a field. + * + * @deprecated use {@link #setMaxFieldLength}instead + */ + public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH; + + /** + * Adds a document to this index. If the document contains more than + * {@link #maxFieldLength}terms for a given field, the remainder are + * discarded. + */ + public void addDocument(Document doc) throws IOException { + addDocument(doc, analyzer); + } + + /** + * Adds a document to this index, using the provided analyzer instead of the + * value of {@link #getAnalyzer()}. If the document contains more than + * {@link #maxFieldLength}terms for a given field, the remainder are + * discarded. + */ + public void addDocument(Document doc, Analyzer analyzer) throws IOException { + DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer, + similarity, maxFieldLength); + dw.setInfoStream(infoStream); + String segmentName = newSegmentName(); + dw.addDocument(segmentName, doc); + synchronized (this) { + segmentInfos.addElement(new SegmentInfo(segmentName, 1, + ramDirectory)); + maybeMergeSegments(); + } + } + + final int getSegmentsCounter() { + return segmentInfos.counter; + } + + private final synchronized String newSegmentName() { + return "_" + + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX); + } + + /** + * Determines how often segment indices are merged by addDocument(). With + * smaller values, less RAM is used while indexing, and searches on + * unoptimized indices are faster, but indexing speed is slower. With larger + * values, more RAM is used during indexing, and while searches on + * unoptimized indices are slower, indexing is faster. Thus larger values (> + * 10) are best for batch index creation, and smaller values ( < 10) for + * indices that are interactively maintained. + * + *+ * This must never be less than 2. The default value is 10. + * + * @deprecated use {@link #setMergeFactor}instead + */ + public int mergeFactor = DEFAULT_MERGE_FACTOR; + + /** + * Determines the minimal number of documents required before the buffered + * in-memory documents are merging and a new Segment is created. Since + * Documents are merged in a {@link org.apache.lucene.store.RAMDirectory}, + * large value gives faster indexing. At the same time, mergeFactor limits + * the number of files open in a FSDirectory. + * + *
+ * The default value is 10. + * + * @deprecated use {@link #setMaxBufferedDocs}instead + */ + public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS; + + /** + * Determines the largest number of documents ever merged by addDocument(). + * Small values (e.g., less than 10,000) are best for interactive indexing, + * as this limits the length of pauses while indexing to a few seconds. + * Larger values are best for batched indexing and speedier searches. + * + *
+ * The default value is {@link Integer#MAX_VALUE}. + * + * @deprecated use {@link #setMaxMergeDocs}instead + */ + public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; + + /** + * If non-null, information about merges will be printed to this. + * + * @deprecated use {@link #setInfoStream}instead + */ + public PrintStream infoStream = null; + + /** + * Merges all segments together into a single segment, optimizing an index + * for search. + */ + public synchronized void optimize() throws IOException { + flushRamSegments(); + while (segmentInfos.size() > 1 + || (segmentInfos.size() == 1 && (SegmentReader + .hasDeletions(segmentInfos.info(0)) + || segmentInfos.info(0).dir != directory || (useCompoundFile && (!SegmentReader + .usesCompoundFile(segmentInfos.info(0)) || SegmentReader + .hasSeparateNorms(segmentInfos.info(0))))))) { + int minSegment = segmentInfos.size() - mergeFactor; + mergeSegments(minSegment < 0 ? 0 : minSegment); + } + } + + /** + * Merges all segments from an array of indexes into this index. + * + *
+ * This may be used to parallelize batch indexing. A large document + * collection can be broken into sub-collections. Each sub-collection can be + * indexed in parallel, on a different thread, process or machine. The + * complete index can then be created by merging sub-collection indexes with + * this method. + * + *
+ * After this completes, the index is optimized. + */ + public synchronized void addIndexes(Directory[] dirs) throws IOException { + optimize(); // start with zero or 1 seg + for (int i = 0; i < dirs.length; i++) { + SegmentInfos sis = new SegmentInfos(); // read infos from dir + sis.read(dirs[i]); + for (int j = 0; j < sis.size(); j++) { + segmentInfos.addElement(sis.info(j)); // add each info + } + } + optimize(); // final cleanup + } + + /** + * Merges the provided indexes into this index. + *
+ * After this completes, the index is optimized. + *
+ *+ * The provided IndexReaders are not closed. + *
+ */ + public synchronized void addIndexes(IndexReader[] readers) + throws IOException { + + optimize(); // start with zero or 1 seg + + final String mergedName = newSegmentName(); + SegmentMerger merger = new SegmentMerger(directory, mergedName); + + final Vector segmentsToDelete = new Vector(); + IndexReader sReader = null; + if (segmentInfos.size() == 1) { // add existing index, if any + sReader = SegmentReader.get(segmentInfos.info(0)); + merger.add(sReader); + segmentsToDelete.addElement(sReader); // queue segment for deletion + } + + for (int i = 0; i < readers.length; i++) + // add new indexes + merger.add(readers[i]); + + int docCount = merger.merge(); // merge 'em + + segmentInfos.setSize(0); // pop old infos & add new + segmentInfos + .addElement(new SegmentInfo(mergedName, docCount, directory)); + + if (sReader != null) + sReader.close(); + + synchronized (directory) { // in- & inter-process sync + new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), + COMMIT_LOCK_TIMEOUT) { + public Object doBody() throws IOException { + segmentInfos.write(directory); // commit changes + deleteSegments(segmentsToDelete); // delete now-unused + // segments + return null; + } + }.run(); + } + + if (useCompoundFile) { + final Vector filesToDelete = merger.createCompoundFile(mergedName + + ".tmp"); + synchronized (directory) { // in- & inter-process sync + new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), + COMMIT_LOCK_TIMEOUT) { + public Object doBody() throws IOException { + // make compound file visible for SegmentReaders + directory.renameFile(mergedName + ".tmp", mergedName + + ".cfs"); + // delete now unused files of segment + deleteFiles(filesToDelete); + return null; + } + }.run(); + } + } + } + + /** Merges all RAM-resident segments. */ + private final void flushRamSegments() throws IOException { + int minSegment = segmentInfos.size() - 1; + int docCount = 0; + while (minSegment >= 0 + && (segmentInfos.info(minSegment)).dir == ramDirectory) { + docCount += segmentInfos.info(minSegment).docCount; + minSegment--; + } + if (minSegment < 0 + || // add one FS segment? + (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor + || !(segmentInfos.info(segmentInfos.size() - 1).dir == ramDirectory)) + minSegment++; + if (minSegment >= segmentInfos.size()) + return; // none to merge + mergeSegments(minSegment); + } + + /** Incremental segment merger. */ + private final void maybeMergeSegments() throws IOException { + long targetMergeDocs = minMergeDocs; + while (targetMergeDocs <= maxMergeDocs) { + // find segments smaller than current target size + int minSegment = segmentInfos.size(); + int mergeDocs = 0; + while (--minSegment >= 0) { + SegmentInfo si = segmentInfos.info(minSegment); + if (si.docCount >= targetMergeDocs) + break; + mergeDocs += si.docCount; + } + + if (mergeDocs >= targetMergeDocs) // found a merge to do + mergeSegments(minSegment + 1); + else + break; + + targetMergeDocs *= mergeFactor; // increase target size + } + } + + /** + * Pops segments off of segmentInfos stack down to minSegment, merges them, + * and pushes the merged index onto the top of the segmentInfos stack. 
+ */ + private final void mergeSegments(int minSegment) throws IOException { + final String mergedName = newSegmentName(); + if (infoStream != null) + infoStream.print("merging segments"); + SegmentMerger merger = new SegmentMerger(directory, mergedName); + + final Vector segmentsToDelete = new Vector(); + for (int i = minSegment; i < segmentInfos.size(); i++) { + SegmentInfo si = segmentInfos.info(i); + if (infoStream != null) + infoStream.print(" " + si.name + " (" + si.docCount + " docs)"); + IndexReader reader = SegmentReader.get(si); + merger.add(reader); + if ((reader.directory() == this.directory) || // if we own the + // directory + (reader.directory() == this.ramDirectory)) + segmentsToDelete.addElement(reader); // queue segment for + // deletion + } + + int mergedDocCount = merger.merge(); + + if (infoStream != null) { + infoStream.println(" into " + mergedName + " (" + mergedDocCount + + " docs)"); + } + + segmentInfos.setSize(minSegment); // pop old infos & add new + segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount, + directory)); + + // close readers before we attempt to delete now-obsolete segments + merger.closeReaders(); + + synchronized (directory) { // in- & inter-process sync + new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), + COMMIT_LOCK_TIMEOUT) { + public Object doBody() throws IOException { + segmentInfos.write(directory); // commit before deleting + deleteSegments(segmentsToDelete); // delete now-unused + // segments + return null; + } + }.run(); + } + + if (useCompoundFile) { + final Vector filesToDelete = merger.createCompoundFile(mergedName + + ".tmp"); + synchronized (directory) { // in- & inter-process sync + new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), + COMMIT_LOCK_TIMEOUT) { + public Object doBody() throws IOException { + // make compound file visible for SegmentReaders + directory.renameFile(mergedName + ".tmp", mergedName + + ".cfs"); + // delete now unused files of segment + deleteFiles(filesToDelete); + return null; + } + }.run(); + } + } + } + + /** + * Expunges the deleted documents. + * Compacts and merges all segments containing deleted documents. 
+ * @throws IOException + */ + public final void expungeDeleted() throws IOException { + final Vector segmentsToDelete = new Vector(); + final SegmentInfos newInfos = new SegmentInfos(); + + if (infoStream != null) { + infoStream.print("Expunging deleted documents."); + } + + // iterate thru all the segment infos + for (int i = 0; i < segmentInfos.size(); i++) { + SegmentInfo si = segmentInfos.info(i); + + Directory directory = si.dir; + + if (SegmentReader.hasDeletions(si)) { // found a deletion + + // make a new segment and merge itself + String newSegment = newSegmentName(); + + SegmentReader reader = SegmentReader.get(si); + SegmentMerger merger = new SegmentMerger(directory, newSegment); + merger.add(reader); + int newDocCount = merger.merge(); + merger.closeReaders(); + + // do the compound file thing + if (useCompoundFile) { + final Vector filesToDelete = merger + .createCompoundFile(newSegment + ".tmp"); + synchronized (directory) { // in- & inter-process sync + + boolean locked = false; + Lock lock = directory.makeLock(COMMIT_LOCK_NAME); + try { + locked = lock.obtain(COMMIT_LOCK_TIMEOUT); + // make compound file visible for SegmentReaders + directory.renameFile(newSegment + ".tmp", + newSegment + ".cfs"); + // delete now unused files of segment + deleteFiles(filesToDelete); + } finally { + if (locked) + lock.release(); + } + } + } + + if ((reader.directory() == this.directory) || // if we own the + // directory + (reader.directory() == this.ramDirectory)) { + segmentsToDelete.add(reader); + } + + newInfos + .add(new SegmentInfo(newSegment, newDocCount, directory)); + } else { + newInfos.add(si); + } + } + + // rewrite the segment info if there is something changed + if (segmentsToDelete.size() > 0) { + synchronized (directory) { // in- & inter-process sync + new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), + COMMIT_LOCK_TIMEOUT) { + public Object doBody() throws IOException { + newInfos.write(directory); // commit before deleting + deleteSegments(segmentsToDelete); // delete now-unused + // segments + return null; + } + }.run(); + } + } + } + + /* + * Some operating systems (e.g. Windows) don't permit a file to be deleted + * while it is opened for read (e.g. by another process or thread). So we + * assume that when a delete fails it is because the file is open in another + * process, and queue the file for subsequent deletion. 
+ */
+
+ private final void deleteSegments(Vector segments) throws IOException {
+ Vector deletable = new Vector();
+
+ deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
+
+ for (int i = 0; i < segments.size(); i++) {
+ SegmentReader reader = (SegmentReader) segments.elementAt(i);
+ if (reader.directory() == this.directory)
+ deleteFiles(reader.files(), deletable); // try to delete our files
+ else
+ deleteFiles(reader.files(), reader.directory()); // delete other files
+ }
+
+ writeDeleteableFiles(deletable); // note files we can't delete
+ }
+
+ private final void deleteFiles(Vector files) throws IOException {
+ Vector deletable = new Vector();
+ deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
+ deleteFiles(files, deletable); // try to delete our files
+ writeDeleteableFiles(deletable); // note files we can't delete
+ }
+
+ private final void deleteFiles(Vector files, Directory directory)
+ throws IOException {
+ for (int i = 0; i < files.size(); i++)
+ directory.deleteFile((String) files.elementAt(i));
+ }
+
+ private final void deleteFiles(Vector files, Vector deletable)
+ throws IOException {
+ for (int i = 0; i < files.size(); i++) {
+ String file = (String) files.elementAt(i);
+ try {
+ directory.deleteFile(file); // try to delete each file
+ } catch (IOException e) { // if delete fails
+ if (directory.fileExists(file)) {
+ if (infoStream != null)
+ infoStream.println(e.toString() + "; Will re-try later.");
+ deletable.addElement(file); // add to deletable
+ }
+ }
+ }
+ }
+
+ private final Vector readDeleteableFiles() throws IOException {
+ Vector result = new Vector();
+ if (!directory.fileExists("deletable"))
+ return result;
+
+ IndexInput input = directory.openInput("deletable");
+ try {
+ for (int i = input.readInt(); i > 0; i--) // read file names
+ result.addElement(input.readString());
+ } finally {
+ input.close();
+ }
+ return result;
+ }
+
+ private final void writeDeleteableFiles(Vector files) throws IOException {
+ IndexOutput output = directory.createOutput("deleteable.new");
+ try {
+ output.writeInt(files.size());
+ for (int i = 0; i < files.size(); i++)
+ output.writeString((String) files.elementAt(i));
+ } finally {
+ output.close();
+ }
+ directory.renameFile("deleteable.new", "deletable");
+ }
 }
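
For reference, a minimal indexing sketch against the API this patch documents; it is not part of the patch. The index path, field name, and the use of StandardAnalyzer are illustrative assumptions. The setters shown replace the now-deprecated public fields (mergeFactor, minMergeDocs, maxMergeDocs, maxFieldLength, infoStream).

import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class IndexingSketch {
  public static void main(String[] args) throws IOException {
    // Third constructor argument: true creates (or overwrites) the index,
    // false appends to an existing one.
    IndexWriter writer = new IndexWriter("/tmp/example-index",
        new StandardAnalyzer(), true);

    // Configure through the accessors rather than the deprecated fields.
    writer.setUseCompoundFile(true); // merge each segment into a single .cfs file
    writer.setMergeFactor(10);       // must never be less than 2
    writer.setMaxBufferedDocs(100);  // documents buffered in RAM before a new segment is created
    writer.setMaxFieldLength(10000); // terms indexed per field before truncation

    Document doc = new Document();
    doc.add(Field.Text("contents", "some example text")); // stored, indexed, tokenized
    writer.addDocument(doc);

    writer.optimize(); // merge all segments into one for optimal search
    writer.close();    // flush changes and release the write lock
  }
}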
+ * + * In either case, documents are added with the addDocument + * method. When finished adding documents, close + * should be called. + * + * If an index will not have more documents added for a while and optimal search + * performance is desired, then the optimize + * method should be called before the index is closed. + */ public class IndexWriter { - /** - * Default value is 1000. Useorg.apache.lucene.writeLockTimeout
- * system property to override.
- */
- public static long WRITE_LOCK_TIMEOUT =
- Integer.parseInt(System.getProperty("org.apache.lucene.writeLockTimeout",
- "1000"));
-
- /**
- * Default value is 10000. Use org.apache.lucene.commitLockTimeout
- * system property to override.
- */
- public static long COMMIT_LOCK_TIMEOUT =
- Integer.parseInt(System.getProperty("org.apache.lucene.commitLockTimeout",
- "10000"));
-
- public static final String WRITE_LOCK_NAME = "write.lock";
- public static final String COMMIT_LOCK_NAME = "commit.lock";
-
- /**
- * Default value is 10. Use org.apache.lucene.mergeFactor
- * system property to override.
- */
- public static final int DEFAULT_MERGE_FACTOR =
- Integer.parseInt(System.getProperty("org.apache.lucene.mergeFactor",
- "10"));
-
- /**
- * Default value is 10. Use org.apache.lucene.minMergeDocs
- * system property to override.
- */
- public static final int DEFAULT_MIN_MERGE_DOCS =
- Integer.parseInt(System.getProperty("org.apache.lucene.minMergeDocs",
- "10"));
-
- /**
- * Default value is {@link Integer#MAX_VALUE}.
- * Use org.apache.lucene.maxMergeDocs system property to override.
- */
- public static final int DEFAULT_MAX_MERGE_DOCS =
- Integer.parseInt(System.getProperty("org.apache.lucene.maxMergeDocs",
- String.valueOf(Integer.MAX_VALUE)));
-
- /**
- * Default value is 10000. Use org.apache.lucene.maxFieldLength
- * system property to override.
- */
- public static final int DEFAULT_MAX_FIELD_LENGTH =
- Integer.parseInt(System.getProperty("org.apache.lucene.maxFieldLength",
- "10000"));
-
-
- private Directory directory; // where this index resides
- private Analyzer analyzer; // how to analyze text
-
- private Similarity similarity = Similarity.getDefault(); // how to normalize
-
- private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
- private final Directory ramDirectory = new RAMDirectory(); // for temp segs
-
- private Lock writeLock;
-
- /** Use compound file setting. Defaults to true, minimizing the number of
- * files used. Setting this to false may improve indexing performance, but
- * may also cause file handle problems.
- */
- private boolean useCompoundFile = true;
-
- private boolean closeDir;
-
- /** Get the current setting of whether to use the compound file format.
- * Note that this just returns the value you set with setUseCompoundFile(boolean)
- * or the default. You cannot use this to query the status of an existing index.
- * @see #setUseCompoundFile(boolean)
- */
- public boolean getUseCompoundFile() {
- return useCompoundFile;
- }
-
- /** Setting to turn on usage of a compound file. When on, multiple files
- * for each segment are merged into a single file once the segment creation
- * is finished. This is done regardless of what directory is in use.
- */
- public void setUseCompoundFile(boolean value) {
- useCompoundFile = value;
- }
-
- /** Expert: Set the Similarity implementation used by this IndexWriter.
- *
- * @see Similarity#setDefault(Similarity)
- */
- public void setSimilarity(Similarity similarity) {
- this.similarity = similarity;
- }
-
- /** Expert: Return the Similarity implementation used by this IndexWriter.
- *
- * This defaults to the current value of {@link Similarity#getDefault()}.
- */
- public Similarity getSimilarity() {
- return this.similarity;
- }
-
- /**
- * Constructs an IndexWriter for the index in path.
- * Text will be analyzed with a. If create
- * is true, then a new, empty index will be created in
- * path, replacing the index already there, if any.
- *
- * @param path the path to the index directory
- * @param a the analyzer to use
- * @param create true to create the index or overwrite
- * the existing one; false to append to the existing
- * index
- * @throws IOException if the directory cannot be read/written to, or
- * if it does not exist, and create is
- * false
- */
- public IndexWriter(String path, Analyzer a, boolean create)
- throws IOException {
- this(FSDirectory.getDirectory(path, create), a, create, true);
- }
-
- /**
- * Constructs an IndexWriter for the index in path.
- * Text will be analyzed with a. If create
- * is true, then a new, empty index will be created in
- * path, replacing the index already there, if any.
- *
- * @param path the path to the index directory
- * @param a the analyzer to use
- * @param create true to create the index or overwrite
- * the existing one; false to append to the existing
- * index
- * @throws IOException if the directory cannot be read/written to, or
- * if it does not exist, and create is
- * false
- */
- public IndexWriter(File path, Analyzer a, boolean create)
- throws IOException {
- this(FSDirectory.getDirectory(path, create), a, create, true);
- }
-
- /**
- * Constructs an IndexWriter for the index in d.
- * Text will be analyzed with a. If create
- * is true, then a new, empty index will be created in
- * d, replacing the index already there, if any.
- *
- * @param d the index directory
- * @param a the analyzer to use
- * @param create true to create the index or overwrite
- * the existing one; false to append to the existing
- * index
- * @throws IOException if the directory cannot be read/written to, or
- * if it does not exist, and create is
- * false
- */
- public IndexWriter(Directory d, Analyzer a, boolean create)
- throws IOException {
- this(d, a, create, false);
- }
-
- private IndexWriter(Directory d, Analyzer a, final boolean create, boolean closeDir)
- throws IOException {
- this.closeDir = closeDir;
- directory = d;
- analyzer = a;
-
- Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
- if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
- throw new IOException("Index locked for write: " + writeLock);
- this.writeLock = writeLock; // save it
-
- synchronized (directory) { // in- & inter-process sync
- new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
- public Object doBody() throws IOException {
- if (create)
- segmentInfos.write(directory);
- else
- segmentInfos.read(directory);
- return null;
- }
- }.run();
- }
- }
-
- /** Determines the largest number of documents ever merged by addDocument().
- * Small values (e.g., less than 10,000) are best for interactive indexing,
- * as this limits the length of pauses while indexing to a few seconds.
- * Larger values are best for batched indexing and speedier searches.
- *
- * The default value is {@link Integer#MAX_VALUE}.
- */
- public void setMaxMergeDocs(int maxMergeDocs) {
- this.maxMergeDocs = maxMergeDocs;
- }
-
- /**
- * @see #setMaxMergeDocs
- */
- public int getMaxMergeDocs() {
- return maxMergeDocs;
- }
-
- /**
- * The maximum number of terms that will be indexed for a single field in a
- * document. This limits the amount of memory required for indexing, so that
- * collections with very large files will not crash the indexing process by
- * running out of memory.
- * Note that this effectively truncates large documents, excluding from the
- * index terms that occur further in the document. If you know your source
- * documents are large, be sure to set this value high enough to accomodate
- * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
- * is your memory, but you should anticipate an OutOfMemoryError.
- * By default, no more than 10,000 terms will be indexed for a field.
- */
- public void setMaxFieldLength(int maxFieldLength) {
- this.maxFieldLength = maxFieldLength;
- }
-
- /**
- * @see #setMaxFieldLength
- */
- public int getMaxFieldLength() {
- return maxFieldLength;
- }
-
- /** Determines the minimal number of documents required before the buffered
- * in-memory documents are merging and a new Segment is created.
- * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
- * large value gives faster indexing. At the same time, mergeFactor limits
- * the number of files open in a FSDirectory.
- *
- * The default value is 10.
- */
- public void setMaxBufferedDocs(int maxBufferedDocs) {
- this.minMergeDocs = maxBufferedDocs;
- }
-
- /**
- * @see #setMaxBufferedDocs
- */
- public int getMaxBufferedDocs() {
- return minMergeDocs;
- }
-
- /** Determines how often segment indices are merged by addDocument(). With
- * smaller values, less RAM is used while indexing, and searches on
- * unoptimized indices are faster, but indexing speed is slower. With larger
- * values, more RAM is used during indexing, and while searches on unoptimized
- * indices are slower, indexing is faster. Thus larger values (> 10) are best
- * for batch index creation, and smaller values (< 10) for indices that are
- * interactively maintained.
- *
- * This must never be less than 2. The default value is 10.
- */
- public void setMergeFactor(int mergeFactor) {
- if (mergeFactor < 2)
- throw new IllegalArgumentException("mergeFactor cannot be less than 2");
- this.mergeFactor = mergeFactor;
- }
-
- /**
- * @see #setMergeFactor
- */
- public int getMergeFactor() {
- return mergeFactor;
- }
-
- /** If non-null, information about merges and a message when
- * maxFieldLength is reached will be printed to this.
- */
- public void setInfoStream(PrintStream infoStream) {
- this.infoStream = infoStream;
- }
-
- /**
- * @see #setInfoStream
- */
- public PrintStream getInfoStream() {
- return infoStream;
- }
-
- /** Flushes all changes to an index and closes all associated files. */
- public synchronized void close() throws IOException {
- flushRamSegments();
- ramDirectory.close();
- if (writeLock != null) {
- writeLock.release(); // release write lock
- writeLock = null;
- }
- if(closeDir)
- directory.close();
- }
-
- /** Release the write lock, if needed. */
- protected void finalize() throws IOException {
- if (writeLock != null) {
- writeLock.release(); // release write lock
- writeLock = null;
- }
- }
-
- /** Returns the analyzer used by this index. */
- public Analyzer getAnalyzer() {
- return analyzer;
- }
-
-
- /** Returns the number of documents currently in this index. */
- public synchronized int docCount() {
- int count = 0;
- for (int i = 0; i < segmentInfos.size(); i++) {
- SegmentInfo si = segmentInfos.info(i);
- count += si.docCount;
- }
- return count;
- }
-
- /**
- * The maximum number of terms that will be indexed for a single field in a
- * document. This limits the amount of memory required for indexing, so that
- * collections with very large files will not crash the indexing process by
- * running out of memory.
- * Note that this effectively truncates large documents, excluding from the
- * index terms that occur further in the document. If you know your source
- * documents are large, be sure to set this value high enough to accomodate
- * the expected size. If you set it to Integer.MAX_VALUE, then the only limit
- * is your memory, but you should anticipate an OutOfMemoryError.
- * By default, no more than 10,000 terms will be indexed for a field.
- *
- * @deprecated use {@link #setMaxFieldLength} instead
- */
- public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH;
-
- /**
- * Adds a document to this index. If the document contains more than
- * {@link #maxFieldLength} terms for a given field, the remainder are
- * discarded.
- */
- public void addDocument(Document doc) throws IOException {
- addDocument(doc, analyzer);
- }
-
- /**
- * Adds a document to this index, using the provided analyzer instead of the
- * value of {@link #getAnalyzer()}. If the document contains more than
- * {@link #maxFieldLength} terms for a given field, the remainder are
- * discarded.
- */
- public void addDocument(Document doc, Analyzer analyzer) throws IOException {
- DocumentWriter dw =
- new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
- dw.setInfoStream(infoStream);
- String segmentName = newSegmentName();
- dw.addDocument(segmentName, doc);
- synchronized (this) {
- segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
- maybeMergeSegments();
- }
- }
-
- final int getSegmentsCounter(){
- return segmentInfos.counter;
- }
-
- private final synchronized String newSegmentName() {
- return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
- }
-
- /** Determines how often segment indices are merged by addDocument(). With
- * smaller values, less RAM is used while indexing, and searches on
- * unoptimized indices are faster, but indexing speed is slower. With larger
- * values, more RAM is used during indexing, and while searches on unoptimized
- * indices are slower, indexing is faster. Thus larger values (> 10) are best
- * for batch index creation, and smaller values (< 10) for indices that are
- * interactively maintained.
- *
- * This must never be less than 2. The default value is 10.
- * @deprecated use {@link #setMergeFactor} instead
- */
- public int mergeFactor = DEFAULT_MERGE_FACTOR;
-
- /** Determines the minimal number of documents required before the buffered
- * in-memory documents are merging and a new Segment is created.
- * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
- * large value gives faster indexing. At the same time, mergeFactor limits
- * the number of files open in a FSDirectory.
- *
- * The default value is 10.
- * @deprecated use {@link #setMaxBufferedDocs} instead
- */
- public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS;
-
-
- /** Determines the largest number of documents ever merged by addDocument().
- * Small values (e.g., less than 10,000) are best for interactive indexing,
- * as this limits the length of pauses while indexing to a few seconds.
- * Larger values are best for batched indexing and speedier searches.
- *
- * The default value is {@link Integer#MAX_VALUE}.
- * @deprecated use {@link #setMaxMergeDocs} instead
- */
- public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
-
- /** If non-null, information about merges will be printed to this.
- * @deprecated use {@link #setInfoStream} instead
- */
- public PrintStream infoStream = null;
-
- /** Merges all segments together into a single segment, optimizing an index
- for search. */
- public synchronized void optimize() throws IOException {
- flushRamSegments();
- while (segmentInfos.size() > 1 ||
- (segmentInfos.size() == 1 &&
- (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
- segmentInfos.info(0).dir != directory ||
- (useCompoundFile &&
- (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
- SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
- int minSegment = segmentInfos.size() - mergeFactor;
- mergeSegments(minSegment < 0 ? 0 : minSegment);
- }
- }
-
- /** Merges all segments from an array of indexes into this index.
- *
- * This may be used to parallelize batch indexing. A large document
- * collection can be broken into sub-collections. Each sub-collection can be
- * indexed in parallel, on a different thread, process or machine. The
- * complete index can then be created by merging sub-collection indexes
- * with this method.
- *
- * After this completes, the index is optimized. */
- public synchronized void addIndexes(Directory[] dirs)
- throws IOException {
- optimize(); // start with zero or 1 seg
- for (int i = 0; i < dirs.length; i++) {
- SegmentInfos sis = new SegmentInfos(); // read infos from dir
- sis.read(dirs[i]);
- for (int j = 0; j < sis.size(); j++) {
- segmentInfos.addElement(sis.info(j)); // add each info
- }
- }
- optimize(); // final cleanup
- }
-
- /** Merges the provided indexes into this index.
- * After this completes, the index is optimized.
- * The provided IndexReaders are not closed.
- */
- public synchronized void addIndexes(IndexReader[] readers)
- throws IOException {
-
- optimize(); // start with zero or 1 seg
-
- final String mergedName = newSegmentName();
- SegmentMerger merger = new SegmentMerger(directory, mergedName);
-
- final Vector segmentsToDelete = new Vector();
- IndexReader sReader = null;
- if (segmentInfos.size() == 1){ // add existing index, if any
- sReader = SegmentReader.get(segmentInfos.info(0));
- merger.add(sReader);
- segmentsToDelete.addElement(sReader); // queue segment for deletion
- }
-
- for (int i = 0; i < readers.length; i++) // add new indexes
- merger.add(readers[i]);
-
- int docCount = merger.merge(); // merge 'em
-
- segmentInfos.setSize(0); // pop old infos & add new
- segmentInfos.addElement(new SegmentInfo(mergedName, docCount, directory));
-
- if(sReader != null)
- sReader.close();
-
- synchronized (directory) { // in- & inter-process sync
- new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
- public Object doBody() throws IOException {
- segmentInfos.write(directory); // commit changes
- deleteSegments(segmentsToDelete); // delete now-unused segments
- return null;
- }
- }.run();
- }
-
- if (useCompoundFile) {
- final Vector filesToDelete = merger.createCompoundFile(mergedName + ".tmp");
- synchronized (directory) { // in- & inter-process sync
- new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
- public Object doBody() throws IOException {
- // make compound file visible for SegmentReaders
- directory.renameFile(mergedName + ".tmp", mergedName + ".cfs");
- // delete now unused files of segment
- deleteFiles(filesToDelete);
- return null;
- }
- }.run();
- }
- }
- }
-
- /** Merges all RAM-resident segments. */
- private final void flushRamSegments() throws IOException {
- int minSegment = segmentInfos.size()-1;
- int docCount = 0;
- while (minSegment >= 0 &&
- (segmentInfos.info(minSegment)).dir == ramDirectory) {
- docCount += segmentInfos.info(minSegment).docCount;
- minSegment--;
- }
- if (minSegment < 0 || // add one FS segment?
- (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
- !(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
- minSegment++;
- if (minSegment >= segmentInfos.size())
- return; // none to merge
- mergeSegments(minSegment);
- }
-
- /** Incremental segment merger. */
- private final void maybeMergeSegments() throws IOException {
- long targetMergeDocs = minMergeDocs;
- while (targetMergeDocs <= maxMergeDocs) {
- // find segments smaller than current target size
- int minSegment = segmentInfos.size();
- int mergeDocs = 0;
- while (--minSegment >= 0) {
- SegmentInfo si = segmentInfos.info(minSegment);
- if (si.docCount >= targetMergeDocs)
- break;
- mergeDocs += si.docCount;
- }
-
- if (mergeDocs >= targetMergeDocs) // found a merge to do
- mergeSegments(minSegment+1);
- else
- break;
-
- targetMergeDocs *= mergeFactor; // increase target size
- }
- }
-
- /** Pops segments off of segmentInfos stack down to minSegment, merges them,
- and pushes the merged index onto the top of the segmentInfos stack. */
- private final void mergeSegments(int minSegment)
- throws IOException {
- final String mergedName = newSegmentName();
- if (infoStream != null) infoStream.print("merging segments");
- SegmentMerger merger =
- new SegmentMerger(directory, mergedName);
-
- final Vector segmentsToDelete = new Vector();
- for (int i = minSegment; i < segmentInfos.size(); i++) {
- SegmentInfo si = segmentInfos.info(i);
- if (infoStream != null)
- infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
- IndexReader reader = SegmentReader.get(si);
- merger.add(reader);
- if ((reader.directory() == this.directory) || // if we own the directory
- (reader.directory() == this.ramDirectory))
- segmentsToDelete.addElement(reader); // queue segment for deletion
- }
-
- int mergedDocCount = merger.merge();
-
- if (infoStream != null) {
- infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
- }
-
- segmentInfos.setSize(minSegment); // pop old infos & add new
- segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
- directory));
-
- // close readers before we attempt to delete now-obsolete segments
- merger.closeReaders();
-
- synchronized (directory) { // in- & inter-process sync
- new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
- public Object doBody() throws IOException {
- segmentInfos.write(directory); // commit before deleting
- deleteSegments(segmentsToDelete); // delete now-unused segments
- return null;
- }
- }.run();
- }
-
- if (useCompoundFile) {
- final Vector filesToDelete = merger.createCompoundFile(mergedName + ".tmp");
- synchronized (directory) { // in- & inter-process sync
- new Lock.With(directory.makeLock(COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
- public Object doBody() throws IOException {
- // make compound file visible for SegmentReaders
- directory.renameFile(mergedName + ".tmp", mergedName + ".cfs");
- // delete now unused files of segment
- deleteFiles(filesToDelete);
- return null;
- }
- }.run();
- }
- }
- }
-
- /*
- * Some operating systems (e.g. Windows) don't permit a file to be deleted
- * while it is opened for read (e.g. by another process or thread). So we
- * assume that when a delete fails it is because the file is open in another
- * process, and queue the file for subsequent deletion.
- */
-
- private final void deleteSegments(Vector segments) throws IOException {
- Vector deletable = new Vector();
-
- deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
-
- for (int i = 0; i < segments.size(); i++) {
- SegmentReader reader = (SegmentReader)segments.elementAt(i);
- if (reader.directory() == this.directory)
- deleteFiles(reader.files(), deletable); // try to delete our files
- else
- deleteFiles(reader.files(), reader.directory()); // delete other files
- }
-
- writeDeleteableFiles(deletable); // note files we can't delete
- }
-
- private final void deleteFiles(Vector files) throws IOException {
- Vector deletable = new Vector();
- deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
- deleteFiles(files, deletable); // try to delete our files
- writeDeleteableFiles(deletable); // note files we can't delete
- }
-
- private final void deleteFiles(Vector files, Directory directory)
- throws IOException {
- for (int i = 0; i < files.size(); i++)
- directory.deleteFile((String)files.elementAt(i));
- }
-
- private final void deleteFiles(Vector files, Vector deletable)
- throws IOException {
- for (int i = 0; i < files.size(); i++) {
- String file = (String)files.elementAt(i);
- try {
- directory.deleteFile(file); // try to delete each file
- } catch (IOException e) { // if delete fails
- if (directory.fileExists(file)) {
- if (infoStream != null)
- infoStream.println(e.toString() + "; Will re-try later.");
- deletable.addElement(file); // add to deletable
- }
- }
- }
- }
-
- private final Vector readDeleteableFiles() throws IOException {
- Vector result = new Vector();
- if (!directory.fileExists("deletable"))
- return result;
-
- IndexInput input = directory.openInput("deletable");
- try {
- for (int i = input.readInt(); i > 0; i--) // read file names
- result.addElement(input.readString());
- } finally {
- input.close();
- }
- return result;
- }
-
- private final void writeDeleteableFiles(Vector files) throws IOException {
- IndexOutput output = directory.createOutput("deleteable.new");
- try {
- output.writeInt(files.size());
- for (int i = 0; i < files.size(); i++)
- output.writeString((String)files.elementAt(i));
- } finally {
- output.close();
- }
- directory.renameFile("deleteable.new", "deletable");
- }
+ /**
+ * Default value is 1000. Use
+ * org.apache.lucene.writeLockTimeout system property to
+ * override.
+ */
+ public static long WRITE_LOCK_TIMEOUT = Integer.parseInt(System
+ .getProperty("org.apache.lucene.writeLockTimeout", "1000"));
+
+ /**
+ * Default value is 10000. Use
+ * org.apache.lucene.commitLockTimeout system property to
+ * override.
+ */
+ public static long COMMIT_LOCK_TIMEOUT = Integer.parseInt(System
+ .getProperty("org.apache.lucene.commitLockTimeout", "10000"));
+
+ public static final String WRITE_LOCK_NAME = "write.lock";
+
+ public static final String COMMIT_LOCK_NAME = "commit.lock";
+
+ /**
+ * Default value is 10. Use org.apache.lucene.mergeFactor
+ * system property to override.
+ */
+ public static final int DEFAULT_MERGE_FACTOR = Integer.parseInt(System
+ .getProperty("org.apache.lucene.mergeFactor", "10"));
+
+ /**
+ * Default value is 10. Use org.apache.lucene.minMergeDocs
+ * system property to override.
+ */
+ public static final int DEFAULT_MIN_MERGE_DOCS = Integer.parseInt(System
+ .getProperty("org.apache.lucene.minMergeDocs", "10"));
+
+ /**
+ * Default value is {@link Integer#MAX_VALUE}. Use
+ * org.apache.lucene.maxMergeDocs system property to
+ * override.
+ */
+ public static final int DEFAULT_MAX_MERGE_DOCS = Integer.parseInt(System
+ .getProperty("org.apache.lucene.maxMergeDocs", String
+ .valueOf(Integer.MAX_VALUE)));
+
+ /**
+ * Default value is 10000. Use org.apache.lucene.maxFieldLength
+ * system property to override.
+ */
+ public static final int DEFAULT_MAX_FIELD_LENGTH = Integer.parseInt(System
+ .getProperty("org.apache.lucene.maxFieldLength", "10000"));
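// Editor's note: illustrative sketch, not part of the patch. The defaults
// above are read from system properties in static initializers, so overrides
// must be in place before IndexWriter is first loaded -- e.g. via
// -Dorg.apache.lucene.mergeFactor=50 on the command line, or set early in
// main() as below (the class name and values are made up for the example).
public class TuneLuceneDefaults {
  public static void main(String[] args) {
    System.setProperty("org.apache.lucene.mergeFactor", "50");
    System.setProperty("org.apache.lucene.writeLockTimeout", "5000");
    // Only now touch IndexWriter, so its static fields see the overrides.
    System.out.println(org.apache.lucene.index.IndexWriter.DEFAULT_MERGE_FACTOR);
  }
}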
+
+ private Directory directory; // where this index resides
+
+ private Analyzer analyzer; // how to analyze text
+
+ private Similarity similarity = Similarity.getDefault(); // how to normalize
+
+ private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
+
+ private final Directory ramDirectory = new RAMDirectory(); // for temp segs
+
+ private Lock writeLock;
+
+ /**
+ * Use compound file setting. Defaults to true, minimizing the number of
+ * files used. Setting this to false may improve indexing performance, but
+ * may also cause file handle problems.
+ */
+ private boolean useCompoundFile = true;
+
+ private boolean closeDir;
+
+ /**
+ * Get the current setting of whether to use the compound file format. Note
+ * that this just returns the value you set with setUseCompoundFile(boolean)
+ * or the default. You cannot use this to query the status of an existing
+ * index.
+ *
+ * @see #setUseCompoundFile(boolean)
+ */
+ public boolean getUseCompoundFile() {
+ return useCompoundFile;
+ }
+
+ /**
+ * Setting to turn on usage of a compound file. When on, multiple files for
+ * each segment are merged into a single file once the segment creation is
+ * finished. This is done regardless of what directory is in use.
+ */
+ public void setUseCompoundFile(boolean value) {
+ useCompoundFile = value;
+ }
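// Editor's note: illustrative fragment, not part of the patch. Assuming
// "writer" is an open IndexWriter, the compound-file switch described above
// is applied before documents are added; disabling it can speed up indexing
// but leaves many more files (and open file handles) per segment.
//
//   writer.setUseCompoundFile(false); // separate per-segment files instead of one .cfs
//   // ... addDocument() calls ...
//   writer.close();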
+
+ /**
+ * Expert: Set the Similarity implementation used by this IndexWriter.
+ *
+ * @see Similarity#setDefault(Similarity)
+ */
+ public void setSimilarity(Similarity similarity) {
+ this.similarity = similarity;
+ }
+
+ /**
+ * Expert: Return the Similarity implementation used by this IndexWriter.
+ *
+ *
+ * This defaults to the current value of {@link Similarity#getDefault()}.
+ */
+ public Similarity getSimilarity() {
+ return this.similarity;
+ }
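// Editor's note: illustrative sketch, not part of the patch. It assumes
// org.apache.lucene.search.DefaultSimilarity and its lengthNorm(String, int)
// method from this era of Lucene; the damping exponent is made up. A custom
// Similarity installed via setSimilarity() above changes how scores are
// normalized for documents indexed by this writer.
import org.apache.lucene.search.DefaultSimilarity;

public class FlatterLengthNormSimilarity extends DefaultSimilarity {
  // Gentler than the default 1/sqrt(numTerms), so long documents are
  // penalized less strongly.
  public float lengthNorm(String fieldName, int numTerms) {
    return (float) (1.0 / Math.pow(numTerms, 0.25));
  }
}
// usage: writer.setSimilarity(new FlatterLengthNormSimilarity());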
+
+ /**
+ * Constructs an IndexWriter for the index in path. Text
+ * will be analyzed with a. If create is
+ * true, then a new, empty index will be created in path,
+ * replacing the index already there, if any.
+ *
+ * @param path
+ * the path to the index directory
+ * @param a
+ * the analyzer to use
+ * @param create
+ * true to create the index or overwrite the
+ * existing one; false to append to the existing
+ * index
+ * @throws IOException
+ * if the directory cannot be read/written to, or if it does not
+ * exist, and create is false
+ */
+ public IndexWriter(String path, Analyzer a, boolean create)
+ throws IOException {
+ this(FSDirectory.getDirectory(path, create), a, create, true);
+ }
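// Editor's note: illustrative usage sketch, not part of the patch. It shows
// the create/append flag described above; the path, analyzer and field name
// are made up for the example.
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class IndexWriterUsageSketch {
  public static void main(String[] args) throws IOException {
    // true => create a new index at this path (or overwrite an existing one)
    IndexWriter writer = new IndexWriter("/tmp/demo-index", new StandardAnalyzer(), true);
    Document doc = new Document();
    doc.add(Field.Text("body", "hello lucene"));
    writer.addDocument(doc);
    writer.optimize(); // merge down to one segment for faster searches
    writer.close();    // flush changes and release the write lock

    // false => open the same index again and append more documents
    writer = new IndexWriter("/tmp/demo-index", new StandardAnalyzer(), false);
    doc = new Document();
    doc.add(Field.Text("body", "a second document"));
    writer.addDocument(doc);
    writer.close();
  }
}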
+
+ /**
+ * Constructs an IndexWriter for the index in path. Text
+ * will be analyzed with a. If create is
+ * true, then a new, empty index will be created in path,
+ * replacing the index already there, if any.
+ *
+ * @param path
+ * the path to the index directory
+ * @param a
+ * the analyzer to use
+ * @param create
+ * true to create the index or overwrite the
+ * existing one; false to append to the existing
+ * index
+ * @throws IOException
+ * if the directory cannot be read/written to, or if it does not
+ * exist, and create is false
+ */
+ public IndexWriter(File path, Analyzer a, boolean create)
+ throws IOException {
+ this(FSDirectory.getDirectory(path, create), a, create, true);
+ }
+
+ /**
+ * Constructs an IndexWriter for the index in d. Text will
+ * be analyzed with a. If create is true,
+ * then a new, empty index will be created in d, replacing
+ * the index already there, if any.
+ *
+ * @param d
+ * the index directory
+ * @param a
+ * the analyzer to use
+ * @param create
+ * true to create the index or overwrite the
+ * existing one; false to append to the existing
+ * index
+ * @throws IOException
+ * if the directory cannot be read/written to, or if it does not
+ * exist, and create is false
+ */
+ public IndexWriter(Directory d, Analyzer a, boolean create)
+ throws IOException {
+ this(d, a, create, false);
+ }
+
+ private IndexWriter(Directory d, Analyzer a, final boolean create,
+ boolean closeDir) throws IOException {
+ this.closeDir = closeDir;
+ directory = d;
+ analyzer = a;
+
+ Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
+ if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
+ throw new IOException("Index locked for write: " + writeLock);
+ this.writeLock = writeLock; // save it
+
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ if (create)
+ segmentInfos.write(directory);
+ else
+ segmentInfos.read(directory);
+ return null;
+ }
+ }.run();
+ }
+ }
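// Editor's note: illustrative fragment, not part of the patch. Only one
// IndexWriter may hold write.lock on an index at a time; a second writer
// opened on the same directory fails in the constructor above with an
// IOException once WRITE_LOCK_TIMEOUT milliseconds have elapsed. A caller
// might surface or retry that failure along these lines (names are made up):
//
//   try {
//     IndexWriter writer = new IndexWriter("/tmp/demo-index", new StandardAnalyzer(), false);
//     // ... add documents ...
//     writer.close();
//   } catch (IOException e) {
//     // e.g. "Index locked for write: ..." when another process holds the lock
//     System.err.println("Could not open index for writing: " + e.getMessage());
//   }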
+
+ /**
+ * Determines the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ *
+ * The default value is {@link Integer#MAX_VALUE}.
+ */
+ public void setMaxMergeDocs(int maxMergeDocs) {
+ this.maxMergeDocs = maxMergeDocs;
+ }
+
+ /**
+ * @see #setMaxMergeDocs
+ */
+ public int getMaxMergeDocs() {
+ return maxMergeDocs;
+ }
+
+ /**
+ * The maximum number of terms that will be indexed for a single field in a
+ * document. This limits the amount of memory required for indexing, so that
+ * collections with very large files will not crash the indexing process by
+ * running out of memory. Note that this effectively truncates large
+ * documents, excluding from the index terms that occur further in the
+ * document. If you know your source documents are large, be sure to set
+ * this value high enough to accommodate the expected size. If you set it to
+ * Integer.MAX_VALUE, then the only limit is your memory, but you should
+ * anticipate an OutOfMemoryError. By default, no more than 10,000
+ * terms will be indexed for a field.
+ */
+ public void setMaxFieldLength(int maxFieldLength) {
+ this.maxFieldLength = maxFieldLength;
+ }
+
+ /**
+ * @see #setMaxFieldLength
+ */
+ public int getMaxFieldLength() {
+ return maxFieldLength;
+ }
+
+ /**
+ * Determines the minimal number of documents required before the buffered
+ * in-memory documents are merged and a new Segment is created. Since
+ * Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
+ * a large value gives faster indexing. At the same time, mergeFactor limits
+ * the number of files open in a FSDirectory.
+ *
+ *
+ * The default value is 10.
+ */
+ public void setMaxBufferedDocs(int maxBufferedDocs) {
+ this.minMergeDocs = maxBufferedDocs;
+ }
+
+ /**
+ * @see #setMaxBufferedDocs
+ */
+ public int getMaxBufferedDocs() {
+ return minMergeDocs;
+ }
+
+ /**
+ * Determines how often segment indices are merged by addDocument(). With
+ * smaller values, less RAM is used while indexing, and searches on
+ * unoptimized indices are faster, but indexing speed is slower. With larger
+ * values, more RAM is used during indexing, and while searches on
+ * unoptimized indices are slower, indexing is faster. Thus larger values
+ * (> 10) are best for batch index creation, and smaller values (< 10) for
+ * indices that are interactively maintained.
+ *
+ *
+ * This must never be less than 2. The default value is 10.
+ */
+ public void setMergeFactor(int mergeFactor) {
+ if (mergeFactor < 2)
+ throw new IllegalArgumentException(
+ "mergeFactor cannot be less than 2");
+ this.mergeFactor = mergeFactor;
+ }
+
+ /**
+ * @see #setMergeFactor
+ */
+ public int getMergeFactor() {
+ return mergeFactor;
+ }
+
+ /**
+ * If non-null, information about merges and a message when maxFieldLength
+ * is reached will be printed to this.
+ */
+ public void setInfoStream(PrintStream infoStream) {
+ this.infoStream = infoStream;
+ }
+
+ /**
+ * @see #setInfoStream
+ */
+ public PrintStream getInfoStream() {
+ return infoStream;
+ }
+
+ /** Flushes all changes to an index and closes all associated files. */
+ public synchronized void close() throws IOException {
+ flushRamSegments();
+ ramDirectory.close();
+ if (writeLock != null) {
+ writeLock.release(); // release write lock
+ writeLock = null;
+ }
+ if (closeDir)
+ directory.close();
+ }
+
+ /** Release the write lock, if needed. */
+ protected void finalize() throws IOException {
+ if (writeLock != null) {
+ writeLock.release(); // release write lock
+ writeLock = null;
+ }
+ }
+
+ /** Returns the analyzer used by this index. */
+ public Analyzer getAnalyzer() {
+ return analyzer;
+ }
+
+ /** Returns the number of documents currently in this index. */
+ public synchronized int docCount() {
+ int count = 0;
+ for (int i = 0; i < segmentInfos.size(); i++) {
+ SegmentInfo si = segmentInfos.info(i);
+ count += si.docCount;
+ }
+ return count;
+ }
+
+ /**
+ * The maximum number of terms that will be indexed for a single field in a
+ * document. This limits the amount of memory required for indexing, so that
+ * collections with very large files will not crash the indexing process by
+ * running out of memory. Note that this effectively truncates large
+ * documents, excluding from the index terms that occur further in the
+ * document. If you know your source documents are large, be sure to set
+ * this value high enough to accommodate the expected size. If you set it to
+ * Integer.MAX_VALUE, then the only limit is your memory, but you should
+ * anticipate an OutOfMemoryError. By default, no more than 10,000
+ * terms will be indexed for a field.
+ *
+ * @deprecated use {@link #setMaxFieldLength} instead
+ */
+ public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH;
+
+ /**
+ * Adds a document to this index. If the document contains more than
+ * {@link #maxFieldLength} terms for a given field, the remainder are
+ * discarded.
+ */
+ public void addDocument(Document doc) throws IOException {
+ addDocument(doc, analyzer);
+ }
+
+ /**
+ * Adds a document to this index, using the provided analyzer instead of the
+ * value of {@link #getAnalyzer()}. If the document contains more than
+ * {@link #maxFieldLength} terms for a given field, the remainder are
+ * discarded.
+ */
+ public void addDocument(Document doc, Analyzer analyzer) throws IOException {
+ DocumentWriter dw = new DocumentWriter(ramDirectory, analyzer,
+ similarity, maxFieldLength);
+ dw.setInfoStream(infoStream);
+ String segmentName = newSegmentName();
+ dw.addDocument(segmentName, doc);
+ synchronized (this) {
+ segmentInfos.addElement(new SegmentInfo(segmentName, 1,
+ ramDirectory));
+ maybeMergeSegments();
+ }
+ }
+
+ final int getSegmentsCounter() {
+ return segmentInfos.counter;
+ }
+
+ private final synchronized String newSegmentName() {
+ return "_"
+ + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
+ }
+
+ /**
+ * Determines how often segment indices are merged by addDocument(). With
+ * smaller values, less RAM is used while indexing, and searches on
+ * unoptimized indices are faster, but indexing speed is slower. With larger
+ * values, more RAM is used during indexing, and while searches on
+ * unoptimized indices are slower, indexing is faster. Thus larger values
+ * (> 10) are best for batch index creation, and smaller values (< 10) for
+ * indices that are interactively maintained.
+ *
+ *
+ * This must never be less than 2. The default value is 10.
+ *
+ * @deprecated use {@link #setMergeFactor} instead
+ */
+ public int mergeFactor = DEFAULT_MERGE_FACTOR;
+
+ /**
+ * Determines the minimal number of documents required before the buffered
+ * in-memory documents are merged and a new Segment is created. Since
+ * Documents are merged in a {@link org.apache.lucene.store.RAMDirectory},
+ * a large value gives faster indexing. At the same time, mergeFactor limits
+ * the number of files open in a FSDirectory.
+ *
+ *
+ * The default value is 10.
+ *
+ * @deprecated use {@link #setMaxBufferedDocs} instead
+ */
+ public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS;
+
+ /**
+ * Determines the largest number of documents ever merged by addDocument().
+ * Small values (e.g., less than 10,000) are best for interactive indexing,
+ * as this limits the length of pauses while indexing to a few seconds.
+ * Larger values are best for batched indexing and speedier searches.
+ *
+ *
+ * The default value is {@link Integer#MAX_VALUE}.
+ *
+ * @deprecated use {@link #setMaxMergeDocs} instead
+ */
+ public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
+
+ /**
+ * If non-null, information about merges will be printed to this.
+ *
+ * @deprecated use {@link #setInfoStream} instead
+ */
+ public PrintStream infoStream = null;
+
+ /**
+ * Merges all segments together into a single segment, optimizing an index
+ * for search.
+ */
+ public synchronized void optimize() throws IOException {
+ flushRamSegments();
+ while (segmentInfos.size() > 1
+ || (segmentInfos.size() == 1 && (SegmentReader
+ .hasDeletions(segmentInfos.info(0))
+ || segmentInfos.info(0).dir != directory || (useCompoundFile && (!SegmentReader
+ .usesCompoundFile(segmentInfos.info(0)) || SegmentReader
+ .hasSeparateNorms(segmentInfos.info(0))))))) {
+ int minSegment = segmentInfos.size() - mergeFactor;
+ mergeSegments(minSegment < 0 ? 0 : minSegment);
+ }
+ }
+
+ /**
+ * Merges all segments from an array of indexes into this index.
+ *
+ *
+ * This may be used to parallelize batch indexing. A large document
+ * collection can be broken into sub-collections. Each sub-collection can be
+ * indexed in parallel, on a different thread, process or machine. The
+ * complete index can then be created by merging sub-collection indexes with
+ * this method.
+ *
+ *
+ * After this completes, the index is optimized.
+ */
+ public synchronized void addIndexes(Directory[] dirs) throws IOException {
+ optimize(); // start with zero or 1 seg
+ for (int i = 0; i < dirs.length; i++) {
+ SegmentInfos sis = new SegmentInfos(); // read infos from dir
+ sis.read(dirs[i]);
+ for (int j = 0; j < sis.size(); j++) {
+ segmentInfos.addElement(sis.info(j)); // add each info
+ }
+ }
+ optimize(); // final cleanup
+ }
+
+ /**
+ * Merges the provided indexes into this index.
+ *
+ * After this completes, the index is optimized.
+ *
+ *
+ * The provided IndexReaders are not closed.
+ *
+ */
+ public synchronized void addIndexes(IndexReader[] readers)
+ throws IOException {
+
+ optimize(); // start with zero or 1 seg
+
+ final String mergedName = newSegmentName();
+ SegmentMerger merger = new SegmentMerger(directory, mergedName);
+
+ final Vector segmentsToDelete = new Vector();
+ IndexReader sReader = null;
+ if (segmentInfos.size() == 1) { // add existing index, if any
+ sReader = SegmentReader.get(segmentInfos.info(0));
+ merger.add(sReader);
+ segmentsToDelete.addElement(sReader); // queue segment for deletion
+ }
+
+ for (int i = 0; i < readers.length; i++)
+ // add new indexes
+ merger.add(readers[i]);
+
+ int docCount = merger.merge(); // merge 'em
+
+ segmentInfos.setSize(0); // pop old infos & add new
+ segmentInfos
+ .addElement(new SegmentInfo(mergedName, docCount, directory));
+
+ if (sReader != null)
+ sReader.close();
+
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ segmentInfos.write(directory); // commit changes
+ deleteSegments(segmentsToDelete); // delete now-unused
+ // segments
+ return null;
+ }
+ }.run();
+ }
+
+ if (useCompoundFile) {
+ final Vector filesToDelete = merger.createCompoundFile(mergedName
+ + ".tmp");
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ // make compound file visible for SegmentReaders
+ directory.renameFile(mergedName + ".tmp", mergedName
+ + ".cfs");
+ // delete now unused files of segment
+ deleteFiles(filesToDelete);
+ return null;
+ }
+ }.run();
+ }
+ }
+ }
+
+ /** Merges all RAM-resident segments. */
+ private final void flushRamSegments() throws IOException {
+ int minSegment = segmentInfos.size() - 1;
+ int docCount = 0;
+ while (minSegment >= 0
+ && (segmentInfos.info(minSegment)).dir == ramDirectory) {
+ docCount += segmentInfos.info(minSegment).docCount;
+ minSegment--;
+ }
+ if (minSegment < 0
+ || // add one FS segment?
+ (docCount + segmentInfos.info(minSegment).docCount) > mergeFactor
+ || !(segmentInfos.info(segmentInfos.size() - 1).dir == ramDirectory))
+ minSegment++;
+ if (minSegment >= segmentInfos.size())
+ return; // none to merge
+ mergeSegments(minSegment);
+ }
+
+ /** Incremental segment merger. */
+ private final void maybeMergeSegments() throws IOException {
+ long targetMergeDocs = minMergeDocs;
+ while (targetMergeDocs <= maxMergeDocs) {
+ // find segments smaller than current target size
+ int minSegment = segmentInfos.size();
+ int mergeDocs = 0;
+ while (--minSegment >= 0) {
+ SegmentInfo si = segmentInfos.info(minSegment);
+ if (si.docCount >= targetMergeDocs)
+ break;
+ mergeDocs += si.docCount;
+ }
+
+ if (mergeDocs >= targetMergeDocs) // found a merge to do
+ mergeSegments(minSegment + 1);
+ else
+ break;
+
+ targetMergeDocs *= mergeFactor; // increase target size
+ }
+ }
+
+ /**
+ * Pops segments off of segmentInfos stack down to minSegment, merges them,
+ * and pushes the merged index onto the top of the segmentInfos stack.
+ */
+ private final void mergeSegments(int minSegment) throws IOException {
+ final String mergedName = newSegmentName();
+ if (infoStream != null)
+ infoStream.print("merging segments");
+ SegmentMerger merger = new SegmentMerger(directory, mergedName);
+
+ final Vector segmentsToDelete = new Vector();
+ for (int i = minSegment; i < segmentInfos.size(); i++) {
+ SegmentInfo si = segmentInfos.info(i);
+ if (infoStream != null)
+ infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
+ IndexReader reader = SegmentReader.get(si);
+ merger.add(reader);
+ if ((reader.directory() == this.directory) || // if we own the
+ // directory
+ (reader.directory() == this.ramDirectory))
+ segmentsToDelete.addElement(reader); // queue segment for
+ // deletion
+ }
+
+ int mergedDocCount = merger.merge();
+
+ if (infoStream != null) {
+ infoStream.println(" into " + mergedName + " (" + mergedDocCount
+ + " docs)");
+ }
+
+ segmentInfos.setSize(minSegment); // pop old infos & add new
+ segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
+ directory));
+
+ // close readers before we attempt to delete now-obsolete segments
+ merger.closeReaders();
+
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ segmentInfos.write(directory); // commit before deleting
+ deleteSegments(segmentsToDelete); // delete now-unused
+ // segments
+ return null;
+ }
+ }.run();
+ }
+
+ if (useCompoundFile) {
+ final Vector filesToDelete = merger.createCompoundFile(mergedName
+ + ".tmp");
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ // make compound file visible for SegmentReaders
+ directory.renameFile(mergedName + ".tmp", mergedName
+ + ".cfs");
+ // delete now unused files of segment
+ deleteFiles(filesToDelete);
+ return null;
+ }
+ }.run();
+ }
+ }
+ }
+
+ /**
+ * Expunges the deleted documents.
+ * Compacts and merges all segments containing deleted documents.
+ * @throws IOException
+ */
+ public final void expungeDeleted() throws IOException {
+ final Vector segmentsToDelete = new Vector();
+ final SegmentInfos newInfos = new SegmentInfos();
+
+ if (infoStream != null) {
+ infoStream.print("Expunging deleted documents.");
+ }
+
+ // iterate thru all the segment infos
+ for (int i = 0; i < segmentInfos.size(); i++) {
+ SegmentInfo si = segmentInfos.info(i);
+
+ Directory directory = si.dir;
+
+ if (SegmentReader.hasDeletions(si)) { // found a deletion
+
+ // make a new segment and merge itself
+ String newSegment = newSegmentName();
+
+ SegmentReader reader = SegmentReader.get(si);
+ SegmentMerger merger = new SegmentMerger(directory, newSegment);
+ merger.add(reader);
+ int newDocCount = merger.merge();
+ merger.closeReaders();
+
+ // do the compound file thing
+ if (useCompoundFile) {
+ final Vector filesToDelete = merger
+ .createCompoundFile(newSegment + ".tmp");
+ synchronized (directory) { // in- & inter-process sync
+
+ boolean locked = false;
+ Lock lock = directory.makeLock(COMMIT_LOCK_NAME);
+ try {
+ locked = lock.obtain(COMMIT_LOCK_TIMEOUT);
+ // make compound file visible for SegmentReaders
+ directory.renameFile(newSegment + ".tmp",
+ newSegment + ".cfs");
+ // delete now unused files of segment
+ deleteFiles(filesToDelete);
+ } finally {
+ if (locked)
+ lock.release();
+ }
+ }
+ }
+
+ if ((reader.directory() == this.directory) || // if we own the
+ // directory
+ (reader.directory() == this.ramDirectory)) {
+ segmentsToDelete.add(reader);
+ }
+
+ newInfos
+ .add(new SegmentInfo(newSegment, newDocCount, directory));
+ } else {
+ newInfos.add(si);
+ }
+ }
+
+ // rewrite the segment info if there is something changed
+ if (segmentsToDelete.size() > 0) {
+ synchronized (directory) { // in- & inter-process sync
+ new Lock.With(directory.makeLock(COMMIT_LOCK_NAME),
+ COMMIT_LOCK_TIMEOUT) {
+ public Object doBody() throws IOException {
+ newInfos.write(directory); // commit before deleting
+ deleteSegments(segmentsToDelete); // delete now-unused
+ // segments
+ return null;
+ }
+ }.run();
+ }
+ }
+ }
+
+ /*
+ * Some operating systems (e.g. Windows) don't permit a file to be deleted
+ * while it is opened for read (e.g. by another process or thread). So we
+ * assume that when a delete fails it is because the file is open in another
+ * process, and queue the file for subsequent deletion.
+ */
+
+ private final void deleteSegments(Vector segments) throws IOException {
+ Vector deletable = new Vector();
+
+ deleteFiles(readDeleteableFiles(), deletable); // try to delete
+ // deleteable
+
+ for (int i = 0; i < segments.size(); i++) {
+ SegmentReader reader = (SegmentReader) segments.elementAt(i);
+ if (reader.directory() == this.directory)
+ deleteFiles(reader.files(), deletable); // try to delete our
+ // files
+ else
+ deleteFiles(reader.files(), reader.directory()); // delete other
+ // files
+ }
+
+ writeDeleteableFiles(deletable); // note files we can't delete
+ }
+
+ private final void deleteFiles(Vector files) throws IOException {
+ Vector deletable = new Vector();
+ deleteFiles(readDeleteableFiles(), deletable); // try to delete
+ // deleteable
+ deleteFiles(files, deletable); // try to delete our files
+ writeDeleteableFiles(deletable); // note files we can't delete
+ }
+
+ private final void deleteFiles(Vector files, Directory directory)
+ throws IOException {
+ for (int i = 0; i < files.size(); i++)
+ directory.deleteFile((String) files.elementAt(i));
+ }
+
+ private final void deleteFiles(Vector files, Vector deletable)
+ throws IOException {
+ for (int i = 0; i < files.size(); i++) {
+ String file = (String) files.elementAt(i);
+ try {
+ directory.deleteFile(file); // try to delete each file
+ } catch (IOException e) { // if delete fails
+ if (directory.fileExists(file)) {
+ if (infoStream != null)
+ infoStream.println(e.toString()
+ + "; Will re-try later.");
+ deletable.addElement(file); // add to deletable
+ }
+ }
+ }
+ }
+
+ private final Vector readDeleteableFiles() throws IOException {
+ Vector result = new Vector();
+ if (!directory.fileExists("deletable"))
+ return result;
+
+ IndexInput input = directory.openInput("deletable");
+ try {
+ for (int i = input.readInt(); i > 0; i--)
+ // read file names
+ result.addElement(input.readString());
+ } finally {
+ input.close();
+ }
+ return result;
+ }
+
+ private final void writeDeleteableFiles(Vector files) throws IOException {
+ IndexOutput output = directory.createOutput("deleteable.new");
+ try {
+ output.writeInt(files.size());
+ for (int i = 0; i < files.size(); i++)
+ output.writeString((String) files.elementAt(i));
+ } finally {
+ output.close();
+ }
+ directory.renameFile("deleteable.new", "deletable");
+ }
}
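// Editor's note: illustrative usage sketch for the expungeDeleted() method
// added by this patch, not part of the patch itself. It assumes the usual
// Lucene idiom of deleting documents through an IndexReader; the path,
// analyzer, field and term value are made up.
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

public class ExpungeDeletedSketch {
  public static void main(String[] args) throws IOException {
    // Mark some documents as deleted; their data stays in the segment files
    // until those segments are merged.
    IndexReader reader = IndexReader.open("/tmp/demo-index");
    reader.delete(new Term("id", "42"));
    reader.close();

    // Rewrite only the segments that actually contain deletions, which can be
    // cheaper than a full optimize() on a large, mostly clean index.
    IndexWriter writer = new IndexWriter("/tmp/demo-index", new StandardAnalyzer(), false);
    writer.expungeDeleted();
    writer.close();
  }
}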