Index: lucene/src/java/org/apache/lucene/util/NumericUtils.java =================================================================== --- lucene/src/java/org/apache/lucene/util/NumericUtils.java (revision 1051154) +++ lucene/src/java/org/apache/lucene/util/NumericUtils.java (working copy) @@ -172,7 +172,7 @@ public static int getPrefixCodedLongShift(final BytesRef val) { final int shift = val.bytes[val.offset] - SHIFT_START_LONG; if (shift > 63 || shift < 0) - throw new NumberFormatException("Invalid shift value in prefixCoded bytes (is encoded value really an INT?)"); + throw new NumberFormatException("Invalid shift value (" + shift + ") in prefixCoded bytes (is encoded value really an INT?)"); return shift; } Index: lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java =================================================================== --- lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (revision 1051154) +++ lucene/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (working copy) @@ -475,8 +475,9 @@ FieldsEnum fields = MultiFields.getFields(reader).iterator(); String fieldName = null; while((fieldName = fields.next()) != null) { - if (fieldName == DocMaker.ID_FIELD) + if (fieldName == DocMaker.ID_FIELD || fieldName == DocMaker.DATE_MSEC_FIELD || fieldName == DocMaker.TIME_SEC_FIELD) { continue; + } TermsEnum terms = fields.terms(); DocsEnum docs = null; while(terms.next() != null) { Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (revision 1051154) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java (working copy) @@ -26,6 +26,7 @@ import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.document.NumericField; /** * Simple task to test performance of tokenizers. It just @@ -67,7 +68,7 @@ Analyzer analyzer = getRunData().getAnalyzer(); int tokenCount = 0; for(final Fieldable field : fields) { - if (!field.isTokenized()) continue; + if (!field.isTokenized() || field instanceof NumericField) continue; final TokenStream stream; final TokenStream streamValue = field.tokenStreamValue(); Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (revision 1051154) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocData.java (working copy) @@ -29,6 +29,7 @@ private String body; private String title; private String date; + private int id; private Properties props; public void clear() { @@ -37,6 +38,7 @@ title = null; date = null; props = null; + id = -1; } public String getBody() { @@ -57,6 +59,10 @@ return name; } + public int getID() { + return id; + } + public Properties getProps() { return props; } @@ -85,6 +91,10 @@ this.name = name; } + public void setID(int id) { + this.id = id; + } + public void setProps(Properties props) { this.props = props; } Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (revision 1051154) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/DocMaker.java (working copy) @@ -20,14 +20,21 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.HashMap; +import java.util.Calendar; import java.util.Map; import java.util.Properties; +import java.util.Locale; import java.util.Random; +import java.util.Date; +import java.util.concurrent.atomic.AtomicInteger; +import java.text.SimpleDateFormat; +import java.text.ParsePosition; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Format; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; @@ -82,6 +89,7 @@ static class DocState { private final Map fields; + private final Map numericFields; private final boolean reuseFields; final Document doc; DocData docData = new DocData(); @@ -92,6 +100,7 @@ if (reuseFields) { fields = new HashMap(); + numericFields = new HashMap(); // Initialize the map with the default fields. fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyStore, bodyIndex, termVector)); @@ -99,9 +108,13 @@ fields.put(DATE_FIELD, new Field(DATE_FIELD, "", store, index, termVector)); fields.put(ID_FIELD, new Field(ID_FIELD, "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); fields.put(NAME_FIELD, new Field(NAME_FIELD, "", store, index, termVector)); + + numericFields.put(DATE_MSEC_FIELD, new NumericField(DATE_MSEC_FIELD)); + numericFields.put(TIME_SEC_FIELD, new NumericField(TIME_SEC_FIELD)); doc = new Document(); } else { + numericFields = null; fields = null; doc = null; } @@ -124,18 +137,42 @@ } return f; } + + NumericField getNumericField(String name) { + if (!reuseFields) { + return new NumericField(name); + } + + NumericField f = numericFields.get(name); + if (f == null) { + f = new NumericField(name); + numericFields.put(name, f); + } + return f; + } } - private int numDocsCreated = 0; private boolean storeBytes = false; + private static class DateUtil { + public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.US); + public Calendar cal = Calendar.getInstance(); + public ParsePosition pos = new ParsePosition(0); + public DateUtil() { + parser.setLenient(true); + } + } + // leftovers are thread local, because it is unsafe to share residues between threads private ThreadLocal leftovr = new ThreadLocal(); private ThreadLocal docState = new ThreadLocal(); + private ThreadLocal dateParsers = new ThreadLocal(); public static final String BODY_FIELD = "body"; public static final String TITLE_FIELD = "doctitle"; public static final String DATE_FIELD = "docdate"; + public static final String DATE_MSEC_FIELD = "docdatenum"; + public static final String TIME_SEC_FIELD = "doctimesecnum"; public static final String ID_FIELD = "docid"; public static final String BYTES_FIELD = "bytes"; public static final String NAME_FIELD = "docname"; @@ -155,6 +192,7 @@ private int lastPrintedNumUniqueTexts = 0; private long lastPrintedNumUniqueBytes = 0; + private final AtomicInteger numDocsCreated = new AtomicInteger(); private int printNum = 0; @@ -169,7 +207,16 @@ // Set ID_FIELD Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal); - idField.setValue("doc" + (r != null ? r.nextInt(updateDocIDLimit) : incrNumDocsCreated())); + int id; + if (r != null) { + id = r.nextInt(updateDocIDLimit); + } else { + id = docData.getID(); + if (id == -1) { + id = numDocsCreated.getAndIncrement(); + } + } + idField.setValue(Integer.toString(id)); doc.add(idField); // Set NAME_FIELD @@ -181,13 +228,39 @@ doc.add(nameField); // Set DATE_FIELD - String date = docData.getDate(); + DateUtil util = dateParsers.get(); + if (util == null) { + util = new DateUtil(); + dateParsers.set(util); + } + Date date = null; + String dateString = docData.getDate(); + if (dateString != null) { + util.pos.setIndex(0); + date = util.parser.parse(dateString, util.pos); + //System.out.println(dateString + " parsed to " + date); + } else { + dateString = ""; + } + Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal); + dateStringField.setValue(dateString); + doc.add(dateStringField); + if (date == null) { - date = ""; + // just set to right now + date = new Date(); } - Field dateField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal); - dateField.setValue(date); + + NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD); + dateField.setLongValue(date.getTime()); doc.add(dateField); + + util.cal.setTime(date); + final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND); + + NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD); + timeSecField.setIntValue(sec); + doc.add(timeSecField); // Set TITLE_FIELD String title = docData.getTitle(); @@ -252,10 +325,6 @@ return ds; } - protected synchronized int incrNumDocsCreated() { - return numDocsCreated++; - } - /** * Closes the {@link DocMaker}. The base implementation closes the * {@link ContentSource}, and it can be overridden to do more work (but make @@ -363,7 +432,7 @@ // re-initiate since properties by round may have changed. setConfig(config); source.resetInputs(); - numDocsCreated = 0; + numDocsCreated.set(0); resetLeftovers(); } Index: lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java =================================================================== --- lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (revision 1051154) +++ lucene/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocSource.java (working copy) @@ -48,6 +48,7 @@ private File file; private BufferedReader reader; + private int readCount; private synchronized void openFile() { try { @@ -71,9 +72,12 @@ @Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { - String line; + final String line; + final int myID; + synchronized(this) { line = reader.readLine(); + myID = readCount++; if (line == null) { if (!forever) { throw new NoMoreDataException(); @@ -96,6 +100,7 @@ } // The date String was written in the format of DateTools.dateToString. docData.clear(); + docData.setID(myID); docData.setBody(line.substring(1 + spot2, line.length())); docData.setTitle(line.substring(0, spot)); docData.setDate(line.substring(1 + spot, spot2));