Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt	(revision 1091436)
+++ lucene/CHANGES.txt	(working copy)
@@ -25,6 +25,9 @@
   indexes, causing existing deletions to be applied on the incoming
   indexes as well.  (Shai Erera, Mike McCandless)
 
+* LUCENE-3024: Index with more than 2.1B terms was hitting AIOOBE.
+  (Tom Burton-West, Mike McCandless)
+
 Test Cases
 
 * LUCENE-3002: added 'tests.iter.min' to control 'tests.iter' by allowing to
Index: lucene/src/test/org/apache/lucene/index/Test2BTerms.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/Test2BTerms.java	(revision 1091436)
+++ lucene/src/test/org/apache/lucene/index/Test2BTerms.java	(working copy)
@@ -19,10 +19,15 @@
 
 import org.apache.lucene.util.*;
 import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.document.*;
+import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 import org.junit.Ignore;
 
 // Best to run this test w/ plenty of RAM (because of the
@@ -30,12 +35,12 @@
 //
 //   ant compile-test
 //
-//   java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=SimpleFSDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
+//   java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=SimpleFSDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
 //
 
 public class Test2BTerms extends LuceneTestCase {
 
-  private static final class MyTokenStream extends TokenStream {
+  private final class MyTokenStream extends TokenStream {
 
     private final int tokensPerDoc;
     private int tokenCount;
@@ -43,6 +48,8 @@
     private final static int TOKEN_LEN = 5;
     private final char[] chars;
     private final byte[] bytes;
+    public final List<String> savedTerms = new ArrayList<String>();
+    private int nextSave;
 
     public MyTokenStream(int tokensPerDoc) {
       super();
@@ -51,6 +58,7 @@
       chars = charTerm.resizeBuffer(TOKEN_LEN);
       charTerm.setLength(TOKEN_LEN);
       bytes = new byte[2*TOKEN_LEN];
+      nextSave = _TestUtil.nextInt(random, 500000, 1000000);
     }
 
     @Override
@@ -65,6 +73,10 @@
         byteUpto += 2;
       }
       tokenCount++;
+      if (--nextSave == 0) {
+        savedTerms.add(new String(chars, 0, TOKEN_LEN));
+        nextSave = _TestUtil.nextInt(random, 500000, 1000000);
+      }
       return true;
     }
 
@@ -77,46 +89,102 @@
 
   @Ignore("Takes ~4 hours to run on a fast machine!!")
   public void test2BTerms() throws IOException {
 
-    long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
+    final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
 
-    int TERMS_PER_DOC = 1000000;
+    final int TERMS_PER_DOC = _TestUtil.nextInt(random, 100000, 1000000);
 
+    List<String> savedTerms = null;
+
     Directory dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
-    IndexWriter w = new IndexWriter(
-        dir,
-        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).
-            setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH).
-            setRAMBufferSizeMB(256.0).
-            setMergeScheduler(new ConcurrentMergeScheduler()).
-            setMergePolicy(newLogMergePolicy(false, 10))
-    );
+    //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
+    if (true) {
+      IndexWriter w = new IndexWriter(dir,
+                                      new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
+                                      .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+                                      .setRAMBufferSizeMB(256.0)
+                                      .setMergeScheduler(new ConcurrentMergeScheduler())
+                                      .setMergePolicy(newLogMergePolicy(false, 10)));
 
-    MergePolicy mp = w.getConfig().getMergePolicy();
-    if (mp instanceof LogByteSizeMergePolicy) {
-      // 1 petabyte:
-      ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
+      MergePolicy mp = w.getConfig().getMergePolicy();
+      if (mp instanceof LogByteSizeMergePolicy) {
+        // 1 petabyte:
+        ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
+      }
+
+      Document doc = new Document();
+      final MyTokenStream ts = new MyTokenStream(TERMS_PER_DOC);
+      Field field = new Field("field", ts);
+      field.setOmitTermFreqAndPositions(true);
+      field.setOmitNorms(true);
+      doc.add(field);
+      //w.setInfoStream(System.out);
+      final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
+
+      System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
+      System.out.println("numDocs=" + numDocs);
+
+      for(int i=0;i<numDocs;i++) {
+        final long t0 = System.currentTimeMillis();
+        w.addDocument(doc);
+        System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
+      }
+      savedTerms = ts.savedTerms;
+      System.out.println("TEST: optimize");
+      w.optimize();
+      System.out.println("TEST: close writer");
+      w.close();
     }
 
-    Document doc = new Document();
-    Field field = new Field("field", new MyTokenStream(TERMS_PER_DOC));
-    field.setOmitTermFreqAndPositions(true);
-    field.setOmitNorms(true);
-    doc.add(field);
-    //w.setInfoStream(System.out);
-    final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
-    for(int i=0;i<numDocs;i++) {
-      final long t0 = System.currentTimeMillis();
-      w.addDocument(doc);
-      System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
-    }
-    System.out.println("now optimize...");
-    w.optimize();
-    w.close();
+    System.out.println("TEST: open reader");
+    final IndexReader r = IndexReader.open(dir);
+    if (savedTerms == null) {
+      savedTerms = findTerms(r);
+    }
+    final int numSavedTerms = savedTerms.size();
+    final List<String> bigOrdTerms = new ArrayList<String>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
+    System.out.println("TEST: test big ord terms...");
+    testSavedTerms(r, bigOrdTerms);
+    System.out.println("TEST: test all saved terms...");
+    testSavedTerms(r, savedTerms);
+    r.close();
 
-    System.out.println("now CheckIndex...");
+    System.out.println("TEST: now CheckIndex...");
     CheckIndex.Status status = _TestUtil.checkIndex(dir);
     final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
     assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
     dir.close();
   }
+
+  private List<String> findTerms(IndexReader r) throws IOException {
+    System.out.println("TEST: findTerms");
+    final TermEnum termEnum = r.terms();
+    final List<String> savedTerms = new ArrayList<String>();
+    int nextSave = _TestUtil.nextInt(random, 500000, 1000000);
+    while(termEnum.next()) {
+      if (--nextSave == 0) {
+        savedTerms.add(termEnum.term().text());
+        System.out.println("TEST: add " + termEnum.term());
+        nextSave = _TestUtil.nextInt(random, 500000, 1000000);
+      }
+    }
+    return savedTerms;
+  }
+
+  private void testSavedTerms(IndexReader r, List<String> terms) throws IOException {
+    System.out.println("TEST: run " + terms.size() + " terms on reader=" + r);
+    IndexSearcher s = new IndexSearcher(r);
+    Collections.shuffle(terms);
+    for(int iter=0;iter<10*terms.size();iter++) {
+      final String term = terms.get(random.nextInt(terms.size()));
+      System.out.println("TEST: search " + term);
+      final long t0 = System.currentTimeMillis();
+      assertTrue(s.search(new TermQuery(new Term("field", term)), 1).totalHits > 0);
+      final long t1 = System.currentTimeMillis();
+      System.out.println(" took " + (t1-t0) + " millis");
+
+      final TermEnum termEnum = r.terms(new Term("field", term));
+      assertEquals(term, termEnum.term().text());
+    }
+  }
 }
Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/CheckIndex.java	(revision 1091436)
+++ lucene/src/java/org/apache/lucene/index/CheckIndex.java	(working copy)
@@ -706,11 +706,11 @@
           throw new RuntimeException("term " + term + " docFreq=" + docFreq + " != num docs seen " + freq0 + " + num docs deleted " + delCount);
         }
+      }
 
-        // Test search on last term:
-        if (lastTerm != null) {
-          is.search(new TermQuery(lastTerm), 1);
-        }
+      // Test search on last term:
+      if (lastTerm != null) {
+        is.search(new TermQuery(lastTerm), 1);
       }
 
       msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
 
Index: lucene/src/java/org/apache/lucene/index/TermInfosReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/TermInfosReader.java	(revision 1091436)
+++ lucene/src/java/org/apache/lucene/index/TermInfosReader.java	(working copy)
@@ -46,8 +46,8 @@
 
   // Just adds term's ord to TermInfo
   private final static class TermInfoAndOrd extends TermInfo {
-    final int termOrd;
-    public TermInfoAndOrd(TermInfo ti, int termOrd) {
+    final long termOrd;
+    public TermInfoAndOrd(TermInfo ti, long termOrd) {
       super(ti);
       this.termOrd = termOrd;
     }
@@ -245,7 +245,7 @@
       // wipe out the cache when they iterate over a large numbers
       // of terms in order
       if (tiOrd == null) {
-        termsCache.put(cacheKey, new TermInfoAndOrd(ti, (int) enumerator.position));
+        termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
       } else {
         assert sameTermInfo(ti, tiOrd, enumerator);
-        assert (int) enumerator.position == tiOrd.termOrd;
+        assert enumerator.position == tiOrd.termOrd;
@@ -262,7 +262,7 @@
       // random-access: must seek
       final int indexPos;
       if (tiOrd != null) {
-        indexPos = tiOrd.termOrd / totalIndexInterval;
+        indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
       } else {
         // Must do binary search:
         indexPos = getIndexOffset(term);
@@ -274,10 +274,10 @@
     if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
       ti = enumerator.termInfo();
       if (tiOrd == null) {
-        termsCache.put(cacheKey, new TermInfoAndOrd(ti, (int) enumerator.position));
+        termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
       } else {
         assert sameTermInfo(ti, tiOrd, enumerator);
-        assert (int) enumerator.position == tiOrd.termOrd;
+        assert enumerator.position == tiOrd.termOrd;
       }
     } else {
       ti = null;
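
Note on the root cause (commentary, not part of the patch): TermInfosReader cached each term's ordinal in an int (TermInfoAndOrd.termOrd), so once an index passed Integer.MAX_VALUE (~2.1B) terms the ord wrapped negative, the derived terms-index position went negative, and the lookup hit the AIOOBE. The standalone sketch below reproduces just that arithmetic; the class name, the interval value of 128, and the sample ord are illustrative assumptions, not Lucene code.

// TermOrdOverflowDemo.java -- minimal sketch of the int overflow fixed above.
public class TermOrdOverflowDemo {
  public static void main(String[] args) {
    final int totalIndexInterval = 128; // assumed terms-index interval, for illustration
    final long ord = 2200000000L;       // a term ord just past Integer.MAX_VALUE

    // Before the fix: the ord was narrowed to int, wrapping negative,
    // which made the derived index position negative as well (the AIOOBE).
    final int truncatedOrd = (int) ord;                        // -2094967296
    final int badIndexPos = truncatedOrd / totalIndexInterval; // -16366932
    System.out.println("int ord=" + truncatedOrd + " -> indexPos=" + badIndexPos);

    // After the fix: the ord stays a long; dividing by the interval first
    // makes the narrowing cast safe, since indexPos itself fits in an int.
    final int goodIndexPos = (int) (ord / totalIndexInterval); // 17187500
    System.out.println("long ord=" + ord + " -> indexPos=" + goodIndexPos);
  }
}

The long-vs-int split also explains the Test2BTerms changes: the test now saves random terms while indexing (MyTokenStream.savedTerms) and then looks each one up again, so term ords beyond 2^31-1 are actually exercised at search time rather than only counted by CheckIndex.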