Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 618010)
+++ CHANGES.txt (working copy)
@@ -18,6 +18,11 @@
compatibility will be removed in 3.0 (hardwiring the value to
true). (Mike McCandless)
+ 2. LUCENE-1044: IndexWriter with autoCommit=true now commits (such
+ that a reader can see the changes) far less often than it used to.
+ Previously, every flush was also a commit. You can always force a
+ commit by calling IndexWriter.commit(). (Mike McCandless)
+
API Changes
1. LUCENE-1084: Changed all IndexWriter constructors to take an
@@ -33,6 +38,11 @@
java.util.BitSet. This allows using more efficient data structures
for Filters and makes them more flexible. (Paul Elschot, Michael Busch)
+ 4. LUCENE-1044: Added IndexWriter.commit() which flushes any buffered
+ adds/deletes and then commits a new segments file so readers will
+ see the changes. Deprecated IndexWriter.flush() in favor of
+ IndexWriter.commit(). (Mike McCandless)
+
Bug fixes
1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
@@ -55,6 +65,11 @@
the core Filters to use OpenBitSet instead of java.util.BitSet.
(Paul Elschot, Michael Busch)
+ 5. LUCENE-1044: Change Lucene to properly "sync" files after
+ committing, to ensure that the index remains consistent after a
+ machine or OS crash or a power cut, even with cached writes. (Mike
+ McCandless)
+
Optimizations
1. LUCENE-705: When building a compound file, use
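
To illustrate the new commit semantics these entries describe, here is a
minimal sketch (not part of the patch; the directory, analyzer and field
setup are illustrative only). Once IndexWriter.commit() returns, a newly
opened IndexReader is guaranteed to see the buffered changes, regardless
of the autoCommit setting:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class CommitDemo {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

        Document doc = new Document();
        doc.add(new Field("id", "1", Field.Store.YES, Field.Index.UN_TOKENIZED));
        writer.addDocument(doc);

        // An auto commit may not have happened yet; force one so a
        // newly opened reader is guaranteed to see the document:
        writer.commit();

        IndexReader reader = IndexReader.open(dir);
        System.out.println("numDocs = " + reader.numDocs());  // prints 1
        reader.close();
        writer.close();
      }
    }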
Index: src/test/org/apache/lucene/store/MockRAMInputStream.java
===================================================================
--- src/test/org/apache/lucene/store/MockRAMInputStream.java (revision 618010)
+++ src/test/org/apache/lucene/store/MockRAMInputStream.java (working copy)
@@ -45,11 +45,14 @@
if (!isClone) {
synchronized(dir.openFiles) {
Integer v = (Integer) dir.openFiles.get(name);
- if (v.intValue() == 1) {
- dir.openFiles.remove(name);
- } else {
- v = new Integer(v.intValue()-1);
- dir.openFiles.put(name, v);
+ // Could be null when MockRAMDirectory.crash() was called
+ if (v != null) {
+ if (v.intValue() == 1) {
+ dir.openFiles.remove(name);
+ } else {
+ v = new Integer(v.intValue()-1);
+ dir.openFiles.put(name, v);
+ }
}
}
}
Index: src/test/org/apache/lucene/store/MockRAMOutputStream.java
===================================================================
--- src/test/org/apache/lucene/store/MockRAMOutputStream.java (revision 618010)
+++ src/test/org/apache/lucene/store/MockRAMOutputStream.java (working copy)
@@ -63,6 +63,11 @@
long freeSpace = dir.maxSize - dir.sizeInBytes();
long realUsage = 0;
+ // If MockRAMDir crashed since we were opened, then
+ // don't write anything:
+ if (dir.crashed)
+ throw new IOException("MockRAMDirectory was crashed");
+
// Enforce disk full:
if (dir.maxSize != 0 && freeSpace <= len) {
// Compute the real disk free. This will greatly slow
Index: src/test/org/apache/lucene/store/MockRAMDirectory.java
===================================================================
--- src/test/org/apache/lucene/store/MockRAMDirectory.java (revision 618010)
+++ src/test/org/apache/lucene/store/MockRAMDirectory.java (working copy)
@@ -24,7 +24,10 @@
import java.util.Random;
import java.util.Map;
import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
import java.util.ArrayList;
+import java.util.Arrays;
/**
* This is a subclass of RAMDirectory that adds methods
@@ -40,6 +43,10 @@
double randomIOExceptionRate;
Random randomState;
boolean noDeleteOpenFile = true;
+ boolean preventDoubleWrite = true;
+ private Set unSyncedFiles;
+ private Set createdFiles;
+ volatile boolean crashed;
// NOTE: we cannot initialize the Map here due to the
// order in which our constructor actually does this
@@ -47,31 +54,80 @@
// like super is called, then our members are initialized:
Map openFiles;
+ private void init() {
+ if (openFiles == null)
+ openFiles = new HashMap();
+ if (createdFiles == null)
+ createdFiles = new HashSet();
+ if (unSyncedFiles == null)
+ unSyncedFiles = new HashSet();
+ }
+
public MockRAMDirectory() {
super();
- if (openFiles == null) {
- openFiles = new HashMap();
- }
+ init();
}
public MockRAMDirectory(String dir) throws IOException {
super(dir);
- if (openFiles == null) {
- openFiles = new HashMap();
- }
+ init();
}
public MockRAMDirectory(Directory dir) throws IOException {
super(dir);
- if (openFiles == null) {
- openFiles = new HashMap();
- }
+ init();
}
public MockRAMDirectory(File dir) throws IOException {
super(dir);
- if (openFiles == null) {
+ init();
+ }
+
+ /** If set to true, we throw an IOException if the same
+ * file is ever opened by createOutput. */
+ public void setPreventDoubleWrite(boolean value) {
+ preventDoubleWrite = value;
+ }
+
+ public synchronized void sync(String name) throws IOException {
+ maybeThrowDeterministicException();
+ if (crashed)
+ throw new IOException("cannot sync after crash");
+ if (unSyncedFiles.contains(name))
+ unSyncedFiles.remove(name);
+ }
+
+ /** Simulates a crash of OS or machine by overwriting
+ * unsynced files. */
+ public void crash() throws IOException {
+ synchronized(this) {
+ crashed = true;
openFiles = new HashMap();
}
+ Iterator it = unSyncedFiles.iterator();
+ unSyncedFiles = new HashSet();
+ int count = 0;
+ while(it.hasNext()) {
+ String name = (String) it.next();
+ RAMFile file = (RAMFile) fileMap.get(name);
+ if (count % 3 == 0) {
+ deleteFile(name, true);
+ } else if (count % 3 == 1) {
+ // Zero out file entirely
+ final int numBuffers = file.numBuffers();
+ for(int i=0;i<numBuffers;i++) {
+ byte[] buffer = file.getBuffer(i);
+ Arrays.fill(buffer, (byte) 0);
+ }
+ } else if (count % 3 == 2) {
+ // Truncate the file:
+ file.setLength(file.getLength()/2);
+ }
+ count++;
+ }
+ }
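
The crash() and sync() hooks above make the new durability guarantee
testable. A hypothetical sketch of such a test (class and field names are
illustrative; the patch's real tests are not shown here): commit() syncs
every file referenced by the new segments_N, so a simulated power cut
immediately afterwards must leave an index that still opens cleanly:

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.MockRAMDirectory;

    public class CrashSketch {
      public static void main(String[] args) throws Exception {
        MockRAMDirectory dir = new MockRAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
        Document doc = new Document();
        doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
        for(int i=0;i<100;i++)
          writer.addDocument(doc);
        writer.commit();  // flush, then sync all files the commit references

        dir.crash();      // unsynced files are deleted, zeroed or truncated

        // Every file referenced by the last commit was synced, so the
        // index must still open and reflect that commit:
        IndexReader reader = IndexReader.open(dir);
        System.out.println("numDocs after crash = " + reader.numDocs());
        reader.close();
        // The writer is deliberately not closed: after crash(), any
        // further write through it throws IOException.
      }
    }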
When autoCommit is true then
- every flush is also a commit ({@link IndexReader}
- instances will see each flush as changes to the index).
- This is the default, to match the behavior before 2.2.
- When running in this mode, be careful not to refresh your
+ the writer will periodically commit on its own. This is
+ the default, to match the behavior before 2.2. There is
+ no guarantee exactly when an auto commit will occur (it
+ used to be after every flush, but as of 2.4 it is after
+ every completed merge). If you want to force a
+ commit, call {@link #commit} or close the writer. Once
+ a commit has finished, {@link IndexReader} instances will
+ see the changes to the index as of that commit. When
+ running in this mode, be careful not to refresh your
readers while optimize or segment merges are taking place
as this can tie up substantial disk space.
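
A sketch of the pattern this javadoc suggests when the application wants
to choose its own commit points (assuming the existing constructor that
takes an autoCommit flag; the loop body is illustrative):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class BatchedCommit {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        // autoCommit=false: readers see nothing until commit() or close()
        IndexWriter writer = new IndexWriter(dir, false, new StandardAnalyzer());
        for(int i=0;i<100;i++) {
          Document doc = new Document();
          doc.add(new Field("id", Integer.toString(i),
                            Field.Store.YES, Field.Index.UN_TOKENIZED));
          writer.addDocument(doc);
        }
        writer.commit();  // all 100 docs become visible to readers at once
        writer.close();
      }
    }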
autoCommit=false, flushed data would still
- * not be visible to readers, until {@link #close} is called.
+ * Note: while this will force buffered docs to be
+ * pushed into the index, it will not make these docs
+ * visible to a reader. Use {@link #commit} instead.
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
+ @deprecated please call {@link #commit} instead
*/
public final void flush() throws CorruptIndexException, IOException {
flush(true, false);
}
/**
+ * Commits all pending updates (added & deleted documents)
+ * to the index, and syncs all referenced index files,
+ * such that a reader will see the changes. Note that
+ * this does not wait for any running background merges to
+ * finish.
+ */
+ public final void commit() throws CorruptIndexException, IOException {
+ commit(true);
+ }
+
+ private final void commit(boolean triggerMerges) throws CorruptIndexException, IOException {
+ flush(triggerMerges, true);
+ sync(true, 0);
+ }
+
+ /**
* Flush all in-memory buffered updates (adds and deletes)
* to the Directory.
* @param triggerMerge if true, we may merge segments (if
@@ -2671,10 +2720,15 @@
maybeMerge();
}
+ // TODO: this method should not have to be entirely
+ // synchronized, ie, merges should be allowed to commit
+ // even while a flush is happening
private synchronized final boolean doFlush(boolean flushDocStores) throws CorruptIndexException, IOException {
// Make sure no threads are actively adding a document
+ flushCount++;
+
// Returns true if docWriter is currently aborting, in
// which case we skip flushing this segment
if (docWriter.pauseAllThreads()) {
@@ -2707,10 +2761,18 @@
// apply to more than just the last flushed segment
boolean flushDeletes = docWriter.hasDeletes();
+ int docStoreOffset = docWriter.getDocStoreOffset();
+
+ // docStoreOffset should only be non-zero when
+ // autoCommit == false
+ assert !autoCommit || 0 == docStoreOffset;
+
+ boolean docStoreIsCompoundFile = false;
+
if (infoStream != null) {
message(" flush: segment=" + docWriter.getSegment() +
" docStoreSegment=" + docWriter.getDocStoreSegment() +
- " docStoreOffset=" + docWriter.getDocStoreOffset() +
+ " docStoreOffset=" + docStoreOffset +
" flushDocs=" + flushDocs +
" flushDeletes=" + flushDeletes +
" flushDocStores=" + flushDocStores +
@@ -2719,14 +2781,6 @@
message(" index before flush " + segString());
}
- int docStoreOffset = docWriter.getDocStoreOffset();
-
- // docStoreOffset should only be non-zero when
- // autoCommit == false
- assert !autoCommit || 0 == docStoreOffset;
-
- boolean docStoreIsCompoundFile = false;
-
// Check if the doc stores must be separately flushed
// because other segments, besides the one we are about
// to flush, reference it
@@ -2744,60 +2798,63 @@
// If we are flushing docs, segment must not be null:
assert segment != null || !flushDocs;
- if (flushDocs || flushDeletes) {
+ if (flushDocs) {
- SegmentInfos rollback = null;
-
- if (flushDeletes)
- rollback = (SegmentInfos) segmentInfos.clone();
-
boolean success = false;
+ final int flushedDocCount;
try {
- if (flushDocs) {
+ flushedDocCount = docWriter.flush(flushDocStores);
+ success = true;
+ } finally {
+ if (!success) {
+ if (infoStream != null)
+ message("hit exception flushing segment " + segment);
+ docWriter.abort(null);
+ deleter.refresh(segment);
+ }
+ }
+
+ if (0 == docStoreOffset && flushDocStores) {
+ // This means we are flushing private doc stores
+ // with this segment, so it will not be shared
+ // with other segments
+ assert docStoreSegment != null;
+ assert docStoreSegment.equals(segment);
+ docStoreOffset = -1;
+ docStoreIsCompoundFile = false;
+ docStoreSegment = null;
+ }
- if (0 == docStoreOffset && flushDocStores) {
- // This means we are flushing private doc stores
- // with this segment, so it will not be shared
- // with other segments
- assert docStoreSegment != null;
- assert docStoreSegment.equals(segment);
- docStoreOffset = -1;
- docStoreIsCompoundFile = false;
- docStoreSegment = null;
- }
+ // Create new SegmentInfo, but do not add to our
+ // segmentInfos until deletes are flushed
+ // successfully.
+ newSegment = new SegmentInfo(segment,
+ flushedDocCount,
+ directory, false, true,
+ docStoreOffset, docStoreSegment,
+ docStoreIsCompoundFile);
+ }
- int flushedDocCount = docWriter.flush(flushDocStores);
-
- newSegment = new SegmentInfo(segment,
- flushedDocCount,
- directory, false, true,
- docStoreOffset, docStoreSegment,
- docStoreIsCompoundFile);
- segmentInfos.addElement(newSegment);
- }
+ if (flushDeletes) {
+ try {
+ SegmentInfos rollback = (SegmentInfos) segmentInfos.clone();
- if (flushDeletes) {
+ boolean success = false;
+ try {
// we should be able to change this so we can
// buffer deletes longer and then flush them to
- // multiple flushed segments, when
- // autoCommit=false
- applyDeletes(flushDocs);
- doAfterFlush();
- }
-
- checkpoint();
- success = true;
- } finally {
- if (!success) {
-
- if (infoStream != null)
- message("hit exception flushing segment " + segment);
+ // multiple flushed segments only when a commit()
+ // finally happens
+ applyDeletes(newSegment);
+ success = true;
+ } finally {
+ if (!success) {
+ if (infoStream != null)
+ message("hit exception flushing deletes");
- if (flushDeletes) {
-
- // Carefully check if any partial .del files
- // should be removed:
+ // Carefully remove any partially written .del
+ // files
final int size = rollback.size();
for(int i=0;i<size;i++) {
-2.3 and above:
+2.3:
 Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField, NormGen^NumField, IsCompoundFile>^SegCount
+
+2.4 and above:
+ Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField, NormGen^NumField, IsCompoundFile>^SegCount, Checksum
+
 Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset --> Int32
-Version, DelGen, NormGen --> Int64
+Version, DelGen, NormGen, Checksum --> Int64
@@ -842,7 +848,7 @@
-Format is -1 as of Lucene 1.4, -3 (SegmentInfos.FORMAT_SINGLE_NORM_FILE) as of Lucene 2.1 and 2.2, and -4 (SegmentInfos.FORMAT_SHARED_DOC_STORE) as of Lucene 2.3
+Format is -1 as of Lucene 1.4, -3 (SegmentInfos.FORMAT_SINGLE_NORM_FILE) as of Lucene 2.1 and 2.2, -4 (SegmentInfos.FORMAT_SHARED_DOC_STORE) as of Lucene 2.3 and -5 (SegmentInfos.FORMAT_CHECKSUM) as of Lucene 2.4.
@@ -925,6 +931,13 @@
 shares a single set of these files with other segments.
+
+Checksum contains the CRC32 checksum of all bytes
+in the segments_N file up until the checksum.
+This is used to verify integrity of the file on
+opening the index.
+
Index: docs/fileformats.pdf
===================================================================
--- docs/fileformats.pdf (revision 618010)
+++ docs/fileformats.pdf (working copy)
[binary diff of the regenerated PDF not shown]
-2.3 and above:
+2.3:
Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
NormGen^NumField,
IsCompoundFile>^SegCount
+
+2.4 and above:
+ Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField,
+ NormGen^NumField,
+ IsCompoundFile>^SegCount, Checksum
+
Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset --> Int32
- Version, DelGen, NormGen --> Int64
+ Version, DelGen, NormGen, Checksum --> Int64
SegName, DocStoreSegment --> String
@@ -1335,7 +1342,7 @@
IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile --> Int8
- Format is -1 as of Lucene 1.4, -3 (SegmentInfos.FORMAT_SINGLE_NORM_FILE) as of Lucene 2.1 and 2.2, and -4 (SegmentInfos.FORMAT_SHARED_DOC_STORE) as of Lucene 2.3
+ Format is -1 as of Lucene 1.4, -3 (SegmentInfos.FORMAT_SINGLE_NORM_FILE) as of Lucene 2.1 and 2.2, -4 (SegmentInfos.FORMAT_SHARED_DOC_STORE) as of Lucene 2.3 and -5 (SegmentInfos.FORMAT_CHECKSUM) as of Lucene 2.4.
Version counts how often the index has been
@@ -1408,7 +1415,13 @@
shares a single set of these files with other
segments.
+ Checksum contains the CRC32 checksum of all bytes
+ in the segments_N file up until the checksum.
+ This is used to verify integrity of the file on
+ opening the index.
+
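
As a quick illustration of the format change described above, a
hypothetical stand-alone checker (not part of this patch): the stored
checksum is the trailing Int64 of segments_N and must equal the CRC32 of
every byte that precedes it.

    import java.io.RandomAccessFile;
    import java.util.zip.CRC32;

    public class VerifySegmentsChecksum {
      public static void main(String[] args) throws Exception {
        // args[0] is a segments_N file; these files are small, so an
        // int-sized buffer is fine for a sketch.
        RandomAccessFile f = new RandomAccessFile(args[0], "r");
        try {
          byte[] data = new byte[(int) (f.length() - 8)];
          f.readFully(data);           // everything before the checksum

          CRC32 crc = new CRC32();
          crc.update(data);

          long stored = f.readLong();  // big-endian, like Lucene's writeLong
          System.out.println(crc.getValue() == stored ? "OK" : "CORRUPT");
        } finally {
          f.close();
        }
      }
    }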
The write lock, which is stored in the index
@@ -1426,7 +1439,7 @@
Note that prior to version 2.1, Lucene also used a
commit lock. This was removed in 2.1.
Prior to Lucene 2.1 there was a file "deletable"
@@ -1435,7 +1448,7 @@
the files that are deletable, instead, so no file
is written.
Starting with Lucene 1.4 the compound file format became default. This
is simply a container for all files described in the next section
@@ -1462,14 +1475,14 @@
-
+
The remaining files are all per-segment, and are
thus defined by suffix.
The term dictionary is represented as two files:
@@ -1874,7 +1887,7 @@
-
+
The .frq file contains the lists of documents
@@ -1992,7 +2005,7 @@
entry in level-1. In the example, entry 15 on level 1 has a pointer to entry 15 on level 0, and entry 31 on level 1 has a pointer
to entry 31 on level 0.
The .prx file contains the lists of positions that
@@ -2058,7 +2071,7 @@
Payload. If PayloadLength is not stored, then this Payload has the same
length as the Payload at the previous position.
@@ -2162,7 +2175,7 @@
2.1 and above:
Separate norm files are created (when adequate) for both compound and non compound segments.
Term Vector support is optional on a field by
@@ -2295,7 +2308,7 @@
-
+
The .del file is
optional, and only exists when a segment contains deletions.
@@ -2367,7 +2380,7 @@
There