Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 607827)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -538,8 +538,7 @@
   }
 
   /**
-   * Make sure we get a friendly exception for a wicked
-   * long term.
+   * Make sure we skip wicked long terms.
    */
   public void testWickedLongTerm() throws IOException {
     RAMDirectory dir = new RAMDirectory();
@@ -552,13 +551,9 @@
 
     // Max length term is 16383, so this contents produces
     // a too-long term:
-    String contents = "abc xyz x" + bigTerm;
+    String contents = "abc xyz x" + bigTerm + " another term";
     doc.add(new Field("content", contents, Field.Store.NO, Field.Index.TOKENIZED));
-    try {
-      writer.addDocument(doc);
-      fail("did not hit expected exception");
-    } catch (IllegalArgumentException e) {
-    }
+    writer.addDocument(doc);
 
     // Make sure we can add another normal document
     doc = new Document();
@@ -567,9 +562,24 @@
     writer.close();
 
     IndexReader reader = IndexReader.open(dir);
+    // Make sure all terms < max size were indexed
     assertEquals(2, reader.docFreq(new Term("content", "abc")));
     assertEquals(1, reader.docFreq(new Term("content", "bbb")));
+    assertEquals(1, reader.docFreq(new Term("content", "term")));
+    assertEquals(1, reader.docFreq(new Term("content", "another")));
+
+    // Make sure position is still incremented when
+    // massive term is skipped:
+    TermPositions tps = reader.termPositions(new Term("content", "another"));
+    assertTrue(tps.next());
+    assertEquals(1, tps.freq());
+    assertEquals(3, tps.nextPosition());
+
+    // Make sure the doc that has the massive term is in
+    // the index:
+    assertEquals("failed document should not be in the index",2,reader.numDocs());
+
     reader.close();
 
     // Make sure we can add a document with exactly the
@@ -1789,7 +1799,18 @@
     writer.close();
 
     IndexReader reader = IndexReader.open(dir);
-    assertEquals(reader.docFreq(new Term("content", "aa")), 3);
+    final Term t = new Term("content", "aa");
+    assertEquals(reader.docFreq(t), 3);
+
+    // Make sure the doc that hit the exception was marked
+    // as deleted:
+    TermDocs tdocs = reader.termDocs(t);
+    int count = 0;
+    while(tdocs.next()) {
+      count++;
+    }
+    assertEquals(2, count);
+
     assertEquals(reader.docFreq(new Term("content", "gg")), 0);
     reader.close();
     dir.close();
@@ -1905,12 +1926,18 @@
       int expected = 3+(1-i)*2;
       assertEquals(expected, reader.docFreq(new Term("contents", "here")));
       assertEquals(expected, reader.maxDoc());
+      int numDel = 0;
       for(int j=0;j

Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java	(revision 607827)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java	(working copy)
-    int maxTermHit;                       // > 0 if this doc has a too-large term
+    String maxTermPrefix;                 // Non-null prefix of a too-large term if this
+                                          // doc has one
     boolean doFlushAfter;
     boolean abortOnExc;
@@ -623,7 +651,7 @@
       numStoredFields = 0;
       numFieldData = 0;
       numVectorFields = 0;
-      maxTermHit = 0;
+      maxTermPrefix = null;
       assert 0 == fdtLocal.length();
       assert 0 == tvfLocal.length();
@@ -989,6 +1017,9 @@
      for(int i=0;i 0.95 * ramBufferSize) balanceRAM();
@@ -1553,11 +1584,17 @@
       final int textLen1 = 1+tokenTextLen;
       if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
         if (textLen1 > CHAR_BLOCK_SIZE) {
-          maxTermHit = tokenTextLen;
-          // Just skip this term; we will throw an
-          // exception after processing all accepted
-          // terms in the doc
+          // Just skip this term, to remain as robust as
+          // possible during indexing.  A TokenFilter
+          // can be inserted into the analyzer chain if
+          // other behavior is wanted (pruning the term
+          // to a prefix, throwing an exception, etc).
           abortOnExc = false;
+          if (maxTermPrefix == null)
+            maxTermPrefix = new String(tokenText, 0, 30);
+
+          // Still increment position:
+          position++;
           return;
         }
         charPool.nextBuffer();
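The comment in the hunk above points at handling over-length tokens in the analyzer chain instead of relying on the silent skip. A minimal sketch of that idea, assuming the stock LengthFilter and WhitespaceTokenizer and the usual java.io / org.apache.lucene.analysis / org.apache.lucene.index imports; the 1-character minimum and the anonymous Analyzer are illustrative only, not part of the patch:

    // Sketch only: drop tokens longer than the indexing limit before they
    // ever reach DocumentsWriter.  IndexWriter.MAX_TERM_LENGTH is the public
    // constant this patch adds further down.
    Analyzer lengthLimited = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new LengthFilter(new WhitespaceTokenizer(reader),
                                1, IndexWriter.MAX_TERM_LENGTH);
      }
    };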
@@ -2267,29 +2304,27 @@
   /** Returns true if the caller (IndexWriter) should now
    * flush. */
-  int addDocument(Document doc, Analyzer analyzer)
+  boolean addDocument(Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, null);
   }
 
-  int updateDocument(Term t, Document doc, Analyzer analyzer)
+  boolean updateDocument(Term t, Document doc, Analyzer analyzer)
     throws CorruptIndexException, IOException {
     return updateDocument(doc, analyzer, t);
   }
 
-  int updateDocument(Document doc, Analyzer analyzer, Term delTerm)
+  boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm)
     throws CorruptIndexException, IOException {
 
     // This call is synchronized but fast
     final ThreadState state = getThreadState(doc, delTerm);
 
     boolean success = false;
-    int maxTermHit;
     try {
       try {
         // This call is not synchronized and does all the work
         state.processDocument(analyzer);
       } finally {
-        maxTermHit = state.maxTermHit;
         // This call is synchronized but fast
         finishDocument(state);
       }
@@ -2299,16 +2334,20 @@
         synchronized(this) {
           state.isIdle = true;
           if (state.abortOnExc)
+            // Abort all buffered docs since last flush
             abort();
+          else
+            // Immediately mark this document as deleted
+            // since likely it was partially added.  This
+            // keeps indexing as "all or none" (atomic) when
+            // adding a document:
+            addDeleteDocID(state.docID);
           notifyAll();
         }
       }
     }
 
-    int status = maxTermHit<<1;
-    if (state.doFlushAfter || timeToFlushDeletes())
-      status += 1;
-    return status;
+    return state.doFlushAfter || timeToFlushDeletes();
   }
 
   synchronized int getNumBufferedDeleteTerms() {
@@ -2319,9 +2358,14 @@
     return bufferedDeleteTerms;
   }
 
+  synchronized List getBufferedDeleteDocIDs() {
+    return bufferedDeleteDocIDs;
+  }
+
   // Reset buffered deletes.
-  synchronized void clearBufferedDeleteTerms() throws IOException {
+  synchronized void clearBufferedDeletes() throws IOException {
     bufferedDeleteTerms.clear();
+    bufferedDeleteDocIDs.clear();
     numBufferedDeleteTerms = 0;
     if (numBytesUsed > 0)
       resetPostingsData();
@@ -2366,7 +2410,7 @@
   }
 
   synchronized boolean hasDeletes() {
-    return bufferedDeleteTerms.size() > 0;
+    return bufferedDeleteTerms.size() > 0 || bufferedDeleteDocIDs.size() > 0;
   }
 
   // Number of documents a delete term applies to.
@@ -2414,6 +2458,13 @@
     numBufferedDeleteTerms++;
   }
 
+  // Buffer a specific docID for deletion.  Currently only
+  // used when we hit an exception when adding a document
+  synchronized private void addDeleteDocID(int docId) {
+    bufferedDeleteDocIDs.add(new Integer(docId));
+    numBytesUsed += OBJECT_HEADER_BYTES + BYTES_PER_INT + OBJECT_POINTER_BYTES;
+  }
+
   /** Does the synchronized work to finish/flush the
    * inverted document.
    */
   private synchronized void finishDocument(ThreadState state) throws IOException {
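For context on the buffered-docID path above: if processDocument throws after some fields were already inverted, the docID is buffered for deletion so the half-added document ends up marked deleted rather than left partially indexed. A rough sketch of what that looks like from the caller's side, under the usual Lucene analysis imports; FailingFilter, failingAnalyzer, writer and doc are made-up names for this sketch, not part of the patch:

    // Sketch only: an analyzer that fails partway through a document.  After
    // the exception, the partially indexed document is buffered for deletion
    // via addDeleteDocID(), matching the new assertions in TestIndexWriter.
    class FailingFilter extends TokenFilter {
      private int seen;
      FailingFilter(TokenStream in) { super(in); }
      public Token next() throws IOException {
        if (seen++ == 2)
          throw new IOException("simulated analyzer failure");
        return input.next();
      }
    }

    Analyzer failingAnalyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new FailingFilter(new WhitespaceTokenizer(reader));
      }
    };

    try {
      writer.addDocument(doc, failingAnalyzer);
    } catch (IOException ioe) {
      // expected: the document that hit the exception is marked deleted
    }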
@@ -2910,6 +2961,8 @@
   final static int CHAR_BLOCK_SIZE = (int) Math.pow(2.0, CHAR_BLOCK_SHIFT);
   final static int CHAR_BLOCK_MASK = CHAR_BLOCK_SIZE - 1;
 
+  final static int MAX_TERM_LENGTH = CHAR_BLOCK_SIZE-1;
+
   private ArrayList freeCharBlocks = new ArrayList();
 
   /* Allocate another char[] from the shared pool */
Index: src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/IndexWriter.java	(revision 607827)
+++ src/java/org/apache/lucene/index/IndexWriter.java	(working copy)
@@ -241,6 +241,14 @@
    * Default value is 128.  Change using {@link #setTermIndexInterval(int)}.
    */
   public final static int DEFAULT_TERM_INDEX_INTERVAL = 128;
+
+  /**
+   * Absolute hard maximum length for a term.  If a term
+   * arrives from the analyzer longer than this length, it
+   * is skipped and a message is printed to infoStream, if
+   * set (see {@link #setInfoStream}).
+   */
+  public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH;
 
   // The normal read buffer size defaults to 1024, but
   // increasing this during merging seems to yield
@@ -1431,10 +1439,10 @@
    */
   public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
     ensureOpen();
-    int status = 0;
+    boolean doFlush = false;
     boolean success = false;
     try {
-      status = docWriter.addDocument(doc, analyzer);
+      doFlush = docWriter.addDocument(doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1453,9 +1461,8 @@
         }
       }
     }
-    if ((status & 1) != 0)
+    if (doFlush)
       flush(true, false);
-    checkMaxTermLength(status);
   }
 
   /**
@@ -1519,10 +1526,10 @@
   public void updateDocument(Term term, Document doc, Analyzer analyzer)
       throws CorruptIndexException, IOException {
     ensureOpen();
-    int status = 0;
+    boolean doFlush = false;
     boolean success = false;
     try {
-      status = docWriter.updateDocument(term, doc, analyzer);
+      doFlush = docWriter.updateDocument(term, doc, analyzer);
       success = true;
     } finally {
       if (!success) {
@@ -1539,19 +1546,10 @@
         }
       }
     }
-    if ((status & 1) != 0)
+    if (doFlush)
       flush(true, false);
-    checkMaxTermLength(status);
   }
 
-  /** Throws IllegalArgumentException if the return status
-   *  from DocumentsWriter.{add,update}Document indicates
-   *  that a too-long term was encountered */
-  final private void checkMaxTermLength(int status) {
-    if (status > 1)
-      throw new IllegalArgumentException("at least one term (length " + (status>>1) + ") exceeds max term length " + (DocumentsWriter.CHAR_BLOCK_SIZE-1) + "; these terms were skipped");
-  }
-
   // for test purpose
   final synchronized int getSegmentCount(){
     return segmentInfos.size();
@@ -2500,9 +2498,7 @@
         // buffer deletes longer and then flush them to
         // multiple flushed segments, when
         // autoCommit=false
-        int delCount = applyDeletes(flushDocs);
-        if (infoStream != null)
-          infoStream.println("flushed " + delCount + " deleted documents");
+        applyDeletes(flushDocs);
 
         doAfterFlush();
       }
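Since skipped terms are now reported only through the infoStream (per the new javadoc above), a quick way to surface them while indexing, plus the new public constant for sizing any application-level guards; the dir and analyzer choices below are placeholders for this sketch:

    // Sketch only: route IndexWriter diagnostics, including skipped-term
    // messages when they are emitted, to stdout, and read the new limit.
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);
    writer.setInfoStream(System.out);
    int maxLen = IndexWriter.MAX_TERM_LENGTH;   // 16383 with the default char block size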
@@ -3220,68 +3216,66 @@
   // flushedNewSegment is true then a new segment was just
   // created and flushed from the ram segments, so we will
   // selectively apply the deletes to that new segment.
-  private final int applyDeletes(boolean flushedNewSegment) throws CorruptIndexException, IOException {
+  private final void applyDeletes(boolean flushedNewSegment) throws CorruptIndexException, IOException {
 
     final HashMap bufferedDeleteTerms = docWriter.getBufferedDeleteTerms();
+    final List bufferedDeleteDocIDs = docWriter.getBufferedDeleteDocIDs();
     int delCount = 0;
 
-    if (bufferedDeleteTerms.size() > 0) {
-      if (infoStream != null)
-        message("flush " + docWriter.getNumBufferedDeleteTerms() + " buffered deleted terms on "
-                + segmentInfos.size() + " segments.");
+    if (infoStream != null)
+      message("flush " + docWriter.getNumBufferedDeleteTerms() + " buffered deleted terms and "
+              + bufferedDeleteDocIDs.size() + " deleted docIDs on "
+              + segmentInfos.size() + " segments.");
 
-      if (flushedNewSegment) {
-        IndexReader reader = null;
-        try {
-          // Open readers w/o opening the stored fields /
-          // vectors because these files may still be held
-          // open for writing by docWriter
-          reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1), false);
+    if (flushedNewSegment) {
+      IndexReader reader = null;
+      try {
+        // Open readers w/o opening the stored fields /
+        // vectors because these files may still be held
+        // open for writing by docWriter
+        reader = SegmentReader.get(segmentInfos.info(segmentInfos.size() - 1), false);
 
-          // Apply delete terms to the segment just flushed from ram
-          // apply appropriately so that a delete term is only applied to
-          // the documents buffered before it, not those buffered after it.
-          delCount += applyDeletesSelectively(bufferedDeleteTerms, reader);
-        } finally {
-          if (reader != null) {
-            try {
-              reader.doCommit();
-            } finally {
-              reader.doClose();
-            }
+        // Apply delete terms to the segment just flushed from ram
+        // apply appropriately so that a delete term is only applied to
+        // the documents buffered before it, not those buffered after it.
+        applyDeletesSelectively(bufferedDeleteTerms, bufferedDeleteDocIDs, reader);
+      } finally {
+        if (reader != null) {
+          try {
+            reader.doCommit();
+          } finally {
+            reader.doClose();
          }
        }
      }
+    }
 
-      int infosEnd = segmentInfos.size();
-      if (flushedNewSegment) {
-        infosEnd--;
-      }
+    int infosEnd = segmentInfos.size();
+    if (flushedNewSegment) {
+      infosEnd--;
+    }
 
-      for (int i = 0; i < infosEnd; i++) {
-        IndexReader reader = null;
-        try {
-          reader = SegmentReader.get(segmentInfos.info(i), false);
+    for (int i = 0; i < infosEnd; i++) {
+      IndexReader reader = null;
+      try {
+        reader = SegmentReader.get(segmentInfos.info(i), false);
 
-          // Apply delete terms to disk segments
-          // except the one just flushed from ram.
-          delCount += applyDeletes(bufferedDeleteTerms, reader);
-        } finally {
-          if (reader != null) {
-            try {
-              reader.doCommit();
-            } finally {
-              reader.doClose();
-            }
+        // Apply delete terms to disk segments
+        // except the one just flushed from ram.
+        applyDeletes(bufferedDeleteTerms, reader);
+      } finally {
+        if (reader != null) {
+          try {
+            reader.doCommit();
+          } finally {
+            reader.doClose();
          }
        }
      }
-
-      // Clean up bufferedDeleteTerms.
-      docWriter.clearBufferedDeleteTerms();
    }
-    return delCount;
+
+    // Clean up bufferedDeleteTerms.
+    docWriter.clearBufferedDeletes();
  }
 
  // For test purposes.
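The new docID path in applyDeletes deletes by document number rather than by term. For reference, the corresponding public IndexReader calls behave differently; the dir, field, value and doc number below are made up for this sketch:

    // Sketch only: deleting by Term removes every matching document, while
    // deleting by doc number removes exactly one slot, which is what the
    // buffered-docID path relies on for a partially added document.
    IndexReader r = IndexReader.open(dir);
    r.deleteDocuments(new Term("id", "42"));   // all docs containing the term
    r.deleteDocument(7);                       // just internal doc number 7
    r.close();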
@@ -3297,10 +3291,10 @@
   // Apply buffered delete terms to the segment just flushed from ram
   // apply appropriately so that a delete term is only applied to
   // the documents buffered before it, not those buffered after it.
-  private final int applyDeletesSelectively(HashMap deleteTerms,
-    IndexReader reader) throws CorruptIndexException, IOException {
+  private final void applyDeletesSelectively(HashMap deleteTerms, List deleteIds,
+                                             IndexReader reader)
+    throws CorruptIndexException, IOException {
     Iterator iter = deleteTerms.entrySet().iterator();
-    int delCount = 0;
     while (iter.hasNext()) {
       Entry entry = (Entry) iter.next();
       Term term = (Term) entry.getKey();
@@ -3315,14 +3309,16 @@
               break;
             }
             reader.deleteDocument(doc);
-            delCount++;
          }
        } finally {
          docs.close();
        }
      }
    }
-    return delCount;
+
+    iter = deleteIds.iterator();
+    while(iter.hasNext())
+      reader.deleteDocument(((Integer) iter.next()).intValue());
  }
 
  // Apply buffered delete terms to this reader.
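Taken together, the patch makes the wicked-long-term case a non-event for callers: the over-length term is skipped, everything else in the document is indexed, and positions stay consistent. A self-contained sketch of the behavior this enables, with all names local to the sketch and the 16383 limit taken from the test comment above:

    import java.util.Arrays;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.*;
    import org.apache.lucene.index.*;
    import org.apache.lucene.store.RAMDirectory;

    public class LongTermDemo {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);

        char[] huge = new char[16384];          // one past the 16383 limit
        Arrays.fill(huge, 'x');
        Document doc = new Document();
        doc.add(new Field("content", "abc " + new String(huge) + " xyz",
                          Field.Store.NO, Field.Index.TOKENIZED));

        writer.addDocument(doc);                // no exception; the huge term is skipped
        writer.close();

        IndexReader reader = IndexReader.open(dir);
        System.out.println(reader.docFreq(new Term("content", "abc")));  // 1
        System.out.println(reader.docFreq(new Term("content", "xyz")));  // 1
        System.out.println(reader.numDocs());                            // 1
        reader.close();
      }
    }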