Index: lucene/CHANGES.txt =================================================================== --- lucene/CHANGES.txt (revision 1491345) +++ lucene/CHANGES.txt (working copy) @@ -143,6 +143,10 @@ * LUCENE-5045: DrillSideways.search did not work on an empty index. (Shai Erera) +* LUCENE-5048: Creating CategoryPath with large components (larger than 65,535 + characters) could result in NegativeArraySizeException or categories being + added multiple times to the taxonomy. (Mike McCandless, Shai Erera) + Optimizations * LUCENE-4936: Improve numeric doc values compression in case all values share Index: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java (revision 1491345) +++ lucene/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java (working copy) @@ -64,9 +64,7 @@ public CategoryPath(final String... components) { assert components.length > 0 : "use CategoryPath.EMPTY to create an empty path"; for (String comp : components) { - if (comp == null || comp.isEmpty()) { - throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(components)); - } + verifyComponent(comp, components); } this.components = components; length = components.length; @@ -80,15 +78,23 @@ length = 0; } else { for (String comp : comps) { - if (comp == null || comp.isEmpty()) { - throw new IllegalArgumentException("empty or null components not allowed: " + Arrays.toString(comps)); - } + verifyComponent(comp, comps); } components = comps; length = components.length; } } + private void verifyComponent(String comp, String[] comps) { + if (comp == null || comp.isEmpty()) { + throw new IllegalArgumentException("empty or null component not allowed: " + Arrays.toString(comps)); + } + if (comp.length() > 65535) { + // this results in incorrect behavior in e.g. Cl2oTaxoWriterCache; see TestDirectoryTaxonomyWriter.testHugeLabel + throw new IllegalArgumentException("component too large (max=65535, actual=" + comp.length() + "): " + Arrays.toString(comps)); + } + } + /** * Returns the number of characters needed to represent the path, including * delimiter characters, for using with Index: lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java =================================================================== --- lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java (revision 1491345) +++ lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CategoryPathUtils.java (working copy) @@ -39,14 +39,14 @@ * {@link #serialize(CategoryPath, CharBlockArray)}. */ public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) { - int length = (short) charBlockArray.charAt(offset++); + int length = charBlockArray.charAt(offset++); if (length == 0) { return 0; } int hash = length; for (int i = 0; i < length; i++) { - int len = (short) charBlockArray.charAt(offset++); + int len = charBlockArray.charAt(offset++); hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode(); offset += len; } @@ -67,7 +67,7 @@ } for (int i = 0; i < cp.length; i++) { - int len = (short) charBlockArray.charAt(offset++); + int len = charBlockArray.charAt(offset++); if (len != cp.components[i].length()) { return false; } Index: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java (revision 1491345) +++ lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java (working copy) @@ -3,6 +3,7 @@ import java.util.Arrays; import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -274,5 +275,31 @@ // expected } } + + @Test + public void testLargeComponent() throws Exception { + String bigComp = null; + while (true) { + bigComp = _TestUtil.randomRealisticUnicodeString(random(), 65536, 65536); + if (bigComp.indexOf('\u001f') != -1) { + continue; + } + break; + } + + try { + assertNotNull(new CategoryPath("dim", bigComp)); + fail("large components should not be allowed; len=" + bigComp.length()); + } catch (IllegalArgumentException e) { + // expected + } + + try { + assertNotNull(new CategoryPath("dim\u001f" + bigComp, '\u001f')); + fail("large components should not be allowed; len=" + bigComp.length()); + } catch (IllegalArgumentException e) { + // expected + } + } } Index: lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java =================================================================== --- lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java (revision 1491345) +++ lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java (working copy) @@ -17,11 +17,12 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; +import org.apache.lucene.util._TestUtil; import org.junit.Test; /* @@ -412,5 +413,37 @@ taxoWriter.close(); dir.close(); } - + + @Test + public void testHugeLabel() throws Exception { + Directory dir = newDirectory(); + DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1)); + + // Add one huge label: + String bigs = null; + int ordinal = -1; + while (true) { + bigs = _TestUtil.randomRealisticUnicodeString(random(), 32767, 32767); + if (bigs.indexOf('\u001f') != -1) { + continue; + } + ordinal = taxoWriter.addCategory(new CategoryPath("dim", bigs)); + break; + } + + // Add tiny ones to cause a re-hash + for (int i = 0; i < 3; i++) { + String s = _TestUtil.randomUnicodeString(random()); + if (s.isEmpty() || s.indexOf('\u001f') != -1) { + continue; + } + taxoWriter.addCategory(new CategoryPath("dim", s)); + } + + // when too large components were allowed to be added, this resulted in a new added category + assertEquals(ordinal, taxoWriter.addCategory(new CategoryPath("dim", bigs))); + + taxoWriter.close(); + dir.close(); + } }