Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1491345) +++ lucene/CHANGES.txt (working copy) @@ -143,6 +143,10 @@ * LUCENE-5045: DrillSideways.search did not work on an empty index. (Shai Erera) +* LUCENE-5048: Creating CategoryPath with large components (larger than 65,535 + characters) could result in NegativeArraySizeException or categories being + added multiple times to the taxonomy. (Colton Jamieson, Mike McCandless, Shai Erera) + Optimizations * LUCENE-4936: Improve numeric doc values compression in case all values share Index: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java (revision 1491345) +++ lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java (working copy) @@ -64,9 +64,7 @@ public CategoryPath(final String... components) { assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path"; for (String comp : components) { - if (comp == null || comp.isEmpty()) { - throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components)); - } + verifyComponent(comp, components); } this.components = components; length = components.length; @@ -80,15 +78,23 @@ length = 0; } else { for (String comp : comps) { - if (comp == null || comp.isEmpty()) { - throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(comps)); - } + verifyComponent(comp, comps); } components = comps; length = components.length; } } + private void verifyComponent(String comp, String[] comps) { + if (comp == null || comp.isEmpty()) { + throw new IllegalArgumentException("empty or null component not allowed: " + Arrays.toString(comps)); + } + if (comp.length() > 32767) { + // this results in incorrect behavior in e.g. Cl2oTaxoWriterCache; see TestDirectoryTaxonomyWriter.testHugeLabel + throw new IllegalArgumentException("component too large (max=32767, actual=" + comp.length() + "): " + Arrays.toString(comps)); + } + } + /** * Returns the number of characters needed to represent the path, including * delimiter characters, for using with Index: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java (revision 1491345) +++ lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java (working copy) @@ -39,14 +39,14 @@ * {@link #serialize(CategoryPath, CharBlockArray)}. */ public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) { - int length = (short) charBlockArray.charAt(offset++); + int length = charBlockArray.charAt(offset++); if (length == 0) { return 0; } int hash = length; for (int i = 0; i < length; i++) { - int len = (short) charBlockArray.charAt(offset++); + int len = charBlockArray.charAt(offset++); hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode(); offset += len; } @@ -67,7 +67,7 @@ } for (int i = 0; i < cp.length; i++) { - int len = (short) charBlockArray.charAt(offset++); + int len = charBlockArray.charAt(offset++); if (len != cp.components[i].length()) { return false; } Index: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java (revision 1491345) +++ lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java (working copy) @@ -3,6 +3,7 @@ import java.util.Arrays; import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -274,5 +275,31 @@ // expected } } + + @Test + public void testLargeComponent() throws Exception { + String bigComp = null; + while (true) { + bigComp = _TestUtil.randomRealisticUnicodeString(random(), 32768, 32768); + if (bigComp.indexOf('\u001f') != -1) { + continue; + } + break; + } + + try { + assertNotNull(new CategoryPath("dim", bigComp)); + fail("large components should not be allowed; len=" + bigComp.length()); + } catch (IllegalArgumentException e) { + // expected + } + + try { + assertNotNull(new CategoryPath("dim\u001f" + bigComp, '\u001f')); + fail("large components should not be allowed; len=" + bigComp.length()); + } catch (IllegalArgumentException e) { + // expected + } + } } Index: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java (revision 1491345) +++ lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java (working copy) @@ -1,13 +1,20 @@ package org.apache.lucene.facet.taxonomy.directory; import java.io.IOException; +import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Random; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.params.CategoryListParams; +import org.apache.lucene.facet.params.FacetIndexingParams; +import org.apache.lucene.facet.search.DrillDownQuery; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap; @@ -19,9 +26,14 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -412,5 +424,64 @@ taxoWriter.close(); dir.close(); } - + + @Test + public void testHugeLabel() throws Exception { + Directory indexDir = newDirectory(), taxoDir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1)); + FacetFields facetFields = new FacetFields(taxoWriter); + + // Add one huge label: + String bigs = null; + int ordinal = -1; + CategoryPath cp = null; + while (true) { + int len = 10921; // nocommit inline and restore to 32767? + bigs = _TestUtil.randomRealisticUnicodeString(random(), len, len); + if (bigs.indexOf('\u001f') != -1) { + continue; + } + cp = new CategoryPath("dim", bigs); + ordinal = taxoWriter.addCategory(cp); + Document doc = new Document(); + facetFields.addFields(doc, Collections.singletonList(cp)); + indexWriter.addDocument(doc); + break; + } + + // Add tiny ones to cause a re-hash + for (int i = 0; i < 3; i++) { + String s = _TestUtil.randomUnicodeString(random()); + if (s.isEmpty() || s.indexOf('\u001f') != -1) { + continue; + } + taxoWriter.addCategory(new CategoryPath("dim", s)); + Document doc = new Document(); + facetFields.addFields(doc, Collections.singletonList(new CategoryPath("dim", s))); + indexWriter.addDocument(doc); + } + + // when too large components were allowed to be added, this resulted in a new added category + assertEquals(ordinal, taxoWriter.addCategory(cp)); + + IOUtils.close(indexWriter, taxoWriter); + + DirectoryReader indexReader = DirectoryReader.open(indexDir); + // nocommit remove + TermsEnum te = MultiFields.getTerms(indexReader, CategoryListParams.DEFAULT_FIELD).iterator(null); + while (te.next() != null) { + String str = te.term().utf8ToString(); + System.out.println(str.length() + " : " + str); + } + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + IndexSearcher searcher = new IndexSearcher(indexReader); + DrillDownQuery ddq = new DrillDownQuery(FacetIndexingParams.DEFAULT); + ddq.add(cp); + assertEquals(1, searcher.search(ddq, 10).totalHits); + + IOUtils.close(indexReader, taxoReader); + + IOUtils.close(indexDir, taxoDir); + } }