Index: lucene/common-build.xml --- lucene/common-build.xml Thu Dec 09 13:13:27 2010 -0500 +++ lucene/common-build.xml Thu Dec 09 13:14:33 2010 -0500 @@ -68,6 +68,7 @@ + @@ -459,6 +460,8 @@ + + Index: lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py --- lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py Thu Dec 09 13:13:27 2010 -0500 +++ lucene/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.py Thu Dec 09 13:14:33 2010 -0500 @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import types import os import sys Index: lucene/src/test/org/apache/lucene/index/TestNRTThreads.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/test/org/apache/lucene/index/TestNRTThreads.java Thu Dec 09 13:14:33 2010 -0500 @@ -0,0 +1,336 @@ +package org.apache.lucene.index; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.MockDirectoryWrapper; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LineFileDocs; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; +import org.junit.Test; + +import static org.junit.Assert.*; +import static org.junit.Assume.*; + +// TODO +// - mix in optimize, addIndexes + +public class TestNRTThreads extends LuceneTestCase { + + @Test + public void testNRTThreads() throws Exception { + + final long t0 = System.currentTimeMillis(); + + if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { + // no + CodecProvider.getDefault().setDefaultFieldCodec("Standard"); + } + + final LineFileDocs docs = new LineFileDocs(true); + final File tempDir = _TestUtil.getTempDir("nrtopenfiles"); + final MockDirectoryWrapper dir = new MockDirectoryWrapper(random, FSDirectory.open(tempDir)); + final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()); + conf.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() { + @Override + public void warm(IndexReader reader) throws IOException { + if (VERBOSE) { + System.out.println("TEST: now warm merged reader=" + reader); + } + final int maxDoc = reader.maxDoc(); + final Bits delDocs = reader.getDeletedDocs(); + int sum = 0; + final int inc = Math.max(1, maxDoc/50); + for(int docID=0;docID 5) { + lmp.setMergeFactor(5); + } + + final int NUM_INDEX_THREADS = 2; + final int NUM_SEARCH_THREADS = 3; + final int RUN_TIME_SEC = LuceneTestCase.TEST_NIGHTLY ? 300 : 5; + + final AtomicBoolean failed = new AtomicBoolean(); + final AtomicInteger addCount = new AtomicInteger(); + final AtomicInteger delCount = new AtomicInteger(); + final long stopTime = System.currentTimeMillis() + RUN_TIME_SEC*1000; + Thread[] threads = new Thread[NUM_INDEX_THREADS]; + for(int thread=0;thread toDeleteIDs = new ArrayList(); + while(System.currentTimeMillis() < stopTime && !failed.get()) { + try { + Document doc = docs.nextDoc(); + if (doc == null) { + break; + } + if (random.nextBoolean()) { + if (VERBOSE) { + //System.out.println(Thread.currentThread().getName() + ": add doc id:" + doc.get("id")); + } + writer.addDocument(doc); + } else { + // we use update but it never replaces a + // prior doc + if (VERBOSE) { + //System.out.println(Thread.currentThread().getName() + ": update doc id:" + doc.get("id")); + } + writer.updateDocument(new Term("id", doc.get("id")), doc); + } + if (random.nextInt(5) == 3) { + if (VERBOSE) { + //System.out.println(Thread.currentThread().getName() + ": buffer del id:" + doc.get("id")); + } + toDeleteIDs.add(doc.get("id")); + } + if (random.nextInt(50) == 17) { + if (VERBOSE) { + System.out.println(Thread.currentThread().getName() + ": apply " + toDeleteIDs.size() + " deletes"); + } + for(String id : toDeleteIDs) { + writer.deleteDocuments(new Term("id", id)); + } + delCount.addAndGet(toDeleteIDs.size()); + toDeleteIDs.clear(); + } + addCount.getAndIncrement(); + } catch (Exception exc) { + System.out.println(Thread.currentThread().getName() + ": hit exc"); + exc.printStackTrace(); + failed.set(true); + throw new RuntimeException(exc); + } + } + } + }; + threads[thread].setDaemon(true); + threads[thread].start(); + } + + if (VERBOSE) { + System.out.println("TEST: DONE start indexing threads [" + (System.currentTimeMillis()-t0) + " ms]"); + } + + // let index build up a bit + Thread.sleep(100); + + IndexReader r = IndexReader.open(writer); + boolean any = false; + + // silly starting guess: + final AtomicInteger totTermCount = new AtomicInteger(100); + + while(System.currentTimeMillis() < stopTime && !failed.get()) { + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println("TEST: now reopen r=" + r); + } + final IndexReader r2 = r.reopen(); + if (r != r2) { + r.close(); + r = r2; + } + } else { + if (VERBOSE) { + System.out.println("TEST: now close reader=" + r); + } + r.close(); + writer.commit(); + final Set openDeletedFiles = dir.getOpenDeletedFiles(); + if (openDeletedFiles.size() > 0) { + System.out.println("OBD files: " + openDeletedFiles); + } + any |= openDeletedFiles.size() > 0; + //assertEquals("open but deleted: " + openDeletedFiles, 0, openDeletedFiles.size()); + if (VERBOSE) { + System.out.println("TEST: now open"); + } + r = IndexReader.open(writer); + } + if (VERBOSE) { + System.out.println("TEST: got new reader=" + r); + } + //System.out.println("numDocs=" + r.numDocs() + " + //openDelFileCount=" + dir.openDeleteFileCount()); + + smokeTestReader(r); + + final IndexSearcher s = new IndexSearcher(r); + + // run search threads + final long searchStopTime = System.currentTimeMillis() + 500; + final Thread[] searchThreads = new Thread[NUM_SEARCH_THREADS]; + final AtomicInteger totHits = new AtomicInteger(); + for(int thread=0;thread 0; + + assertFalse("saw non-zero open-but-deleted count", any); + if (VERBOSE) { + System.out.println("TEST: now join"); + } + for(int thread=0;thread threadDocs = new ThreadLocal(); + + // Document instance is re-used per-thread + public Document nextDoc() throws IOException { + String line; + synchronized(this) { + line = reader.readLine(); + if (line == null) { + if (forever) { + if (LuceneTestCase.VERBOSE) { + System.out.println("TEST: LineFileDocs: now rewind file..."); + } + close(); + open(); + line = reader.readLine(); + } + return null; + } + } + + DocState docState = threadDocs.get(); + if (docState == null) { + docState = new DocState(); + threadDocs.set(docState); + } + + int spot = line.indexOf(SEP); + if (spot == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + int spot2 = line.indexOf(SEP, 1 + spot); + if (spot2 == -1) { + throw new RuntimeException("line: [" + line + "] is in an invalid format !"); + } + + docState.body.setValue(line.substring(1+spot2, line.length())); + final String title = line.substring(0, spot); + docState.title.setValue(title); + docState.titleTokenized.setValue(title); + docState.date.setValue(line.substring(1+spot, spot2)); + docState.id.setValue(Integer.toString(id.getAndIncrement())); + return docState.doc; + } +} Index: lucene/src/test/org/apache/lucene/util/LuceneTestCase.java --- lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec 09 13:13:27 2010 -0500 +++ lucene/src/test/org/apache/lucene/util/LuceneTestCase.java Thu Dec 09 13:14:33 2010 -0500 @@ -128,19 +128,21 @@ // each test case (non-J4 tests) and each test class (J4 // tests) /** Gets the codec to run tests with. */ - static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField"); + public static final String TEST_CODEC = System.getProperty("tests.codec", "randomPerField"); /** Gets the locale to run tests with */ - static final String TEST_LOCALE = System.getProperty("tests.locale", "random"); + public static final String TEST_LOCALE = System.getProperty("tests.locale", "random"); /** Gets the timezone to run tests with */ - static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random"); + public static final String TEST_TIMEZONE = System.getProperty("tests.timezone", "random"); /** Gets the directory to run tests with */ - static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random"); + public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random"); /** Get the number of times to run tests */ - static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1")); + public static final int TEST_ITER = Integer.parseInt(System.getProperty("tests.iter", "1")); /** Get the random seed for tests */ - static final String TEST_SEED = System.getProperty("tests.seed", "random"); + public static final String TEST_SEED = System.getProperty("tests.seed", "random"); /** whether or not nightly tests should run */ - static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false")); + public static final boolean TEST_NIGHTLY = Boolean.parseBoolean(System.getProperty("tests.nightly", "false")); + /** the line file used by LineFileDocs */ + public static final String TEST_LINE_DOCS_FILE = System.getProperty("tests.linedocsfile", "europarl.lines.txt.gz"); private static final Pattern codecWithParam = Pattern.compile("(.*)\\(\\s*(\\d+)\\s*\\)"); Index: lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz Binary file lucene/src/test/org/apache/lucene/util/europarl.lines.txt.gz has changed Index: lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ lucene/src/test/org/apache/lucene/util/makeEuroparlLineFile.py Thu Dec 09 13:14:33 2010 -0500 @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import glob +import datetime +import tarfile +import re + +try: + sys.argv.remove('-verbose') + VERBOSE = True +except ValueError: + VERBOSE = False + +try: + sys.argv.remove('-docPerParagraph') + docPerParagraph = True +except ValueError: + docPerParagraph = False + +reChapterOnly = re.compile('^$') +reTagOnly = re.compile('^<.*?>$') +reNumberOnly = re.compile(r'^\d+\.?$') + +docCount = 0 +didEnglish = False + +def write(date, title, pending, fOut): + global docCount + body = ' '.join(pending).replace('\t', ' ').strip() + if len(body) > 0: + line = '%s\t%s\t%s\n' % (title, date, body) + fOut.write(line) + docCount += 1 + del pending[:] + if VERBOSE: + print len(body) + +def processTar(fileName, fOut): + + global didEnglish + + t = tarfile.open(fileName, 'r:gz') + for ti in t: + if ti.isfile() and (not didEnglish or ti.name.find('/en/') == -1): + + tup = ti.name.split('/') + lang = tup[1] + year = int(tup[2][3:5]) + if year < 20: + year += 2000 + else: + year += 1900 + + month = int(tup[2][6:8]) + day = int(tup[2][9:11]) + date = datetime.date(year=year, month=month, day=day) + + if VERBOSE: + print + print '%s: %s' % (ti.name, date) + nextIsTitle = False + title = None + pending = [] + for line in t.extractfile(ti).readlines(): + line = line.strip() + if reChapterOnly.match(line) is not None: + if title is not None: + write(date, title, pending, fOut) + nextIsTitle = True + continue + if nextIsTitle: + if not reNumberOnly.match(line) and not reTagOnly.match(line): + title = line + nextIsTitle = False + if VERBOSE: + print ' title %s' % line + continue + if line.lower() == '

': + if docPerParagraph: + write(date, title, pending, fOut) + else: + pending.append('PARSEP') + elif not reTagOnly.match(line): + pending.append(line) + if title is not None and len(pending) > 0: + write(date, title, pending, fOut) + + didEnglish = True + +# '/x/lucene/data/europarl/all.lines.txt' +dirIn = sys.argv[1] +fileOut = sys.argv[2] + +fOut = open(fileOut, 'wb') + +for fileName in glob.glob('%s/??-??.tgz' % dirIn): + if fileName.endswith('.tgz'): + print 'process %s; %d docs so far...' % (fileName, docCount) + processTar(fileName, fOut) + +print 'TOTAL: %s' % docCount + +#run something like this: +""" + +# Europarl V5 makes 76,917 docs, avg 38.6 KB per +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt +rm /x/lucene/data/europarl/tmp.lines.txt + +# Run again, this time each paragraph is a doc: +# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per: +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt +rm /x/lucene/data/europarl/tmp.lines.txt + +# ~5.5 MB gzip'd: +head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt +head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt +shuf tmp.txt > europarl.subset.txt +rm -f tmp.txt +gzip --best europarl.subset.txt +"""