Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
===================================================================
--- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java	(revision 1843637)
+++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java	(working copy)
@@ -29,11 +29,12 @@
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
 import org.apache.jackrabbit.util.ISO8601;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 
 import static org.apache.lucene.document.Field.Store.NO;
 import static org.apache.lucene.document.Field.Store.YES;
@@ -107,21 +108,29 @@
     }
 
     public static Collection newSimilarityFields(String name, Blob value) throws IOException {
-        Collection fields = new ArrayList<>(1);
+        Collection<Field> fields = new ArrayList<>(2);
         byte[] bytes = new BlobByteSource(value).read();
-//        fields.add(newBinarySimilarityField(name, bytes));
+        fields.add(newBinarySimilarityField(name, bytes));
         fields.add(newSimilarityField(name, bytes));
         return fields;
     }
 
     public static Collection newSimilarityFields(String name, String value) {
-        Collection fields = new ArrayList<>(1);
-//        byte[] bytes = SimSearchUtils.toByteArray(value);
-//        fields.add(newBinarySimilarityField(name, bytes));
+        Collection<Field> fields = new ArrayList<>(2);
+        byte[] bytes = SimSearchUtils.toByteArray(value);
+        fields.add(newBinarySimilarityField(name, bytes));
         fields.add(newSimilarityField(name, value));
         return fields;
     }
 
+    /**
+     * Creates the stored field that keeps the raw feature vector bytes under the name returned by
+     * {@link FieldNames#createBinSimilarityFieldName(String)}.
+     */
+    private static StoredField newBinarySimilarityField(String name, byte[] bytes) {
+        return new StoredField(FieldNames.createBinSimilarityFieldName(name), bytes);
+    }
+
private static Field newSimilarityField(String name, byte[] bytes) { return newSimilarityField(name, SimSearchUtils.toDoubleString(bytes)); } Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (revision 1843637) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (working copy) @@ -20,15 +20,7 @@ import javax.jcr.PropertyType; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Deque; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Predicate; @@ -120,6 +112,7 @@ import org.apache.lucene.search.postingshighlight.PostingsHighlighter; import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; @@ -128,6 +121,7 @@ import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Predicates.in; import static com.google.common.base.Predicates.notNull; import static com.google.common.collect.Lists.newArrayListWithCapacity; import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES; @@ -373,6 +367,34 @@ mergedFieldInfos = MultiFields.getMergedFieldInfos(searcher.getIndexReader()); } + if (docs.scoreDocs.length > 1) { + // reranking step for fv sim search + PropertyRestriction pr = null; + LuceneIndexDefinition defn = indexNode.getDefinition(); + if 
(defn.hasFunctionDefined()) { + pr = filter.getPropertyRestriction(defn.getFunctionName()); + } + if (pr != null) { + String queryString = String.valueOf(pr.first.getValue(pr.first.getType())); + if (queryString.startsWith("mlt?")) { + List sp = new LinkedList<>(); + for (IndexingRule r : defn.getDefinedRules()) { + List similarityProperties = r.getSimilarityProperties(); + for (PropertyDefinition pd : similarityProperties) { + if (pd.similarityRerank) { + sp.add(pd); + } + } + } + if (!sp.isEmpty()) { + long fvs = PERF_LOGGER.start(); + SimSearchUtils.bruteForceFVRerank(sp, docs, indexSearcher); + PERF_LOGGER.end(fvs, -1, "fv reranking done"); + } + } + } + } + for (ScoreDoc doc : docs.scoreDocs) { Map excerpts = null; if (addExcerpt) { Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java (revision 1843637) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java (working copy) @@ -327,6 +327,12 @@ return this; } + public PropertyRule useInSimilarity(boolean rerank) { + propTree.setProperty(LuceneIndexConstants.PROP_USE_IN_SIMILARITY, true); + propTree.setProperty(FulltextIndexConstants.PROP_SIMILARITY_RERANK, rerank); + return this; + } + public PropertyRule type(String type){ //This would throw an IAE if type is invalid PropertyType.valueFromName(type); Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java (revision 1843637) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java (working copy) @@ -18,10 +18,7 
@@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedList; -import java.util.List; +import java.util.*; import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition; @@ -39,6 +36,7 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.BytesRef; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,7 +50,7 @@ private static final Logger log = LoggerFactory.getLogger(SimSearchUtils.class); public static String toDoubleString(byte[] bytes) { - Double[] a = toDoubleArray(bytes); + double[] a = toDoubleArray(bytes); StringBuilder builder = new StringBuilder(); for (Double d : a) { if (builder.length() > 0) { @@ -63,11 +61,6 @@ return builder.toString(); } - private static Double[] toDoubleArray(byte[] array) { - List doubles = toDoubles(array); - return doubles.toArray(new Double[doubles.size()]); - } - public static List toDoubles(byte[] array) { int blockSize = Double.SIZE / Byte.SIZE; ByteBuffer wrap = ByteBuffer.wrap(array); @@ -80,6 +73,18 @@ return doubles; } + private static double[] toDoubleArray(byte[] array) { + int blockSize = Double.SIZE / Byte.SIZE; + ByteBuffer wrap = ByteBuffer.wrap(array); + int capacity = array.length / blockSize; + double[] doubles = new double[capacity]; + for (int i = 0; i < capacity; i++) { + double e = wrap.getDouble(i * blockSize); + doubles[i] = e; + } + return doubles; + } + private static Collection getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException { Collection tokens = new LinkedList<>(); TokenStream ts = analyzer.tokenStream(field, sampleTextString); @@ -174,4 +179,42 @@ } } -} + public static void bruteForceFVRerank(List sp, TopDocs docs, IndexSearcher indexSearcher) throws IOException { + double 
farthestDistance = 50d; + int k = 10; + ScoreDoc inputDoc = docs.scoreDocs[0]; // we assume the input doc is the first one returned + List toDiscard = new LinkedList<>(); + for (PropertyDefinition pd : sp) { + String fieldName = FieldNames.createBinSimilarityFieldName(pd.name); + BytesRef binaryValue = indexSearcher.doc(inputDoc.doc).getBinaryValue(fieldName); + double[] inputVector = SimSearchUtils.toDoubleArray(binaryValue.bytes); + for (int j = 0; j < docs.scoreDocs.length; j++) { + double[] currentVector = SimSearchUtils.toDoubleArray(indexSearcher.doc(docs.scoreDocs[j].doc) + .getBinaryValue(fieldName).bytes); + double distance = dist(inputVector, currentVector) + 1e-10; // constant term to avoid division by zero + + if (distance > farthestDistance) { // a threshold distance above which current vector is discarded + toDiscard.add(docs.scoreDocs[j].doc); + } + docs.scoreDocs[j].score += 1d / distance; // additive similarity boosting + } + } + if (!toDiscard.isEmpty()) { + docs.scoreDocs = Arrays.stream(docs.scoreDocs).filter(e -> !toDiscard.contains(e.doc)).toArray(ScoreDoc[]::new); // remove docs that are not close enough + } + Arrays.sort(docs.scoreDocs, 1, docs.scoreDocs.length - 1, (o1, o2) -> (int) Math.rint(o2.score - o1.score)); // rerank scoreDocs + if (docs.scoreDocs.length > k) { + docs.scoreDocs = Arrays.copyOfRange(docs.scoreDocs, 0, k); // retain only the top k nearest neighbours + } + docs.setMaxScore(docs.scoreDocs[0].score); + } + + private static double dist(double[] x, double[] y) { // euclidean distance + double d = 0; + for (int i = 0; i < x.length; i++) { + d += Math.pow(y[i] - x[i], 2); + } + return Math.sqrt(d); + } + +} \ No newline at end of file Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java =================================================================== --- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (revision 
1843637)
+++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java	(working copy)
@@ -2988,6 +2988,7 @@
             String name = split[0];
             Tree child = test.addChild(name);
             child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
         }
         root.commit();
 
@@ -3006,7 +3007,6 @@
             baseline.clear();
             baseline.addAll(current);
         }
-
     }
 
     @Test
@@ -3054,6 +3054,95 @@
         }
     }
 
+    @Test
+    public void testRepSimilarWithBinaryFeatureVectorsAndRerank() throws Exception {
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : readFeatureVectorLines()) {
+            String[] split = line.split(",");
+            List<Double> values = new LinkedList<>();
+            for (int i = 1; i < split.length; i++) { // column 0 holds the node name
+                values.add(Double.parseDouble(split[i]));
+            }
+
+            byte[] bytes = SimSearchUtils.toByteArray(values);
+            List<Double> actual = SimSearchUtils.toDoubles(bytes);
+            assertEquals(values, actual);
+
+            Blob blob = root.createBlob(new ByteArrayInputStream(bytes));
+            String name = split[0];
+            Tree child = test.addChild(name);
+            child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        assertSimilarResultsDiffer(children);
+    }
+
+    @Test
+    public void testRepSimilarWithStringFeatureVectorsAndRerank() throws Exception {
+        IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync();
+        idxb.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex().propertyIndex();
+
+        Tree idx = root.getTree("/").getChild("oak:index").addChild("test1");
+        idxb.build(idx);
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+
+        Collection<String> children = new LinkedList<>();
+        for (String line : readFeatureVectorLines()) {
+            int i1 = line.indexOf(',');
+            String name = line.substring(0, i1);
+            String value = line.substring(i1 + 1);
+            Tree child = test.addChild(name);
+            child.setProperty("fv", value, Type.STRING);
+            children.add(child.getPath());
+        }
+        root.commit();
+
+        assertSimilarResultsDiffer(children);
+    }
+
+    /**
+     * Reads the lines of the feature vector fixture and closes the file afterwards.
+     */
+    private List<String> readFeatureVectorLines() throws Exception {
+        URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
+        try (FileInputStream in = new FileInputStream(new File(uri))) {
+            return IOUtils.readLines(in, Charset.forName("UTF-8"));
+        }
+    }
+
+    /**
+     * Runs a {@code similar()} query for every given path and asserts that each query returns a
+     * result different from the one of the previous query.
+     */
+    private void assertSimilarResultsDiffer(Collection<String> paths) throws Exception {
+        List<String> baseline = new LinkedList<>();
+        for (String similarPath : paths) {
+            String query = "select [jcr:path] from [nt:base] where similar(., '" + similarPath + "')";
+
+            Iterator<String> result = executeQuery(query, "JCR-SQL2").iterator();
+            List<String> current = new LinkedList<>();
+            while (result.hasNext()) {
+                current.add(result.next());
+            }
+            assertNotEquals(baseline, current);
+            baseline.clear();
+            baseline.addAll(current);
+        }
+    }
+
     private void assertPlanAndQuery(String query, String planExpectation, List paths) {
         assertPlanAndQuery(query, planExpectation, paths, false);
     }
Index: oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java
===================================================================
--- 
oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java	(revision 1843637)
+++ oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java	(working copy)
@@ -60,6 +60,11 @@
     private static final String SIMILARITY_PREFIX = "sim:";
 
     /**
+     * Prefix for all field names that contain the similarity search binary values.
+     */
+    private static final String SIMILARITY_BINARY_PREFIX = "simbin:";
+
+    /**
      * Name of the field that contains the suggest index.
      */
     public static final String SUGGEST = ":suggest";
@@ -147,4 +152,14 @@
     public static String createSimilarityFieldName(String name) {
         return SIMILARITY_PREFIX + name;
     }
+
+    /**
+     * Builds the name of the field that holds the binary similarity search value of a property.
+     *
+     * @param name the property name
+     * @return {@code name} prefixed with {@code simbin:}
+     */
+    public static String createBinSimilarityFieldName(String name) {
+        return SIMILARITY_BINARY_PREFIX + name;
+    }
 }
Index: oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java
===================================================================
--- oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java	(revision 1843637)
+++ oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FulltextIndexConstants.java	(working copy)
@@ -249,12 +249,17 @@
      */
     String PROP_USE_IN_SPELLCHECK = "useInSpellcheck";
 
     /**
      * whether use this property values for similarity
      */
     String PROP_USE_IN_SIMILARITY = "useInSimilarity";
 
     /**
+     * whether feature vector similarity search should rerank results based on the feature values
+     */
+    String PROP_SIMILARITY_RERANK = "similarityRerank";
+
+    /**
      * Property definition config indicating that null check support should be
      * enabled for this property
      */
Index: oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java
===================================================================
--- 
oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java	(revision 1843637)
+++ oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java	(working copy)
@@ -127,6 +127,11 @@
 
     public final boolean useInSimilarity;
 
+    /**
+     * Whether similarity search results should be re-ranked using this property's feature vectors;
+     * read from {@link FulltextIndexConstants#PROP_SIMILARITY_RERANK}.
+     */
+    public final boolean similarityRerank;
+
     public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState defn) {
         this.isRegexp = getOptionalValue(defn, PROP_IS_REGEX, false);
         this.name = getName(defn, nodeName);
@@ -156,6 +161,7 @@
         this.useInSuggest = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_USE_IN_SUGGEST, false);
         this.useInSpellcheck = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_USE_IN_SPELLCHECK, false);
         this.useInSimilarity = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_USE_IN_SIMILARITY, false);
+        this.similarityRerank = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_SIMILARITY_RERANK, false);
         this.nullCheckEnabled = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_NULL_CHECK_ENABLED, false);
         this.notNullCheckEnabled = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_NOT_NULL_CHECK_ENABLED, false);
         this.excludeFromAggregate = getOptionalValueIfIndexed(defn, FulltextIndexConstants.PROP_EXCLUDE_FROM_AGGREGATE, false);
@@ -233,6 +239,8 @@
                 ", analyzed=" + analyzed +
                 ", ordered=" + ordered +
                 ", useInSuggest=" + useInSuggest+
+                ", useInSimilarity=" + useInSimilarity +
+                ", similarityRerank=" + similarityRerank +
                 ", nullCheckEnabled=" + nullCheckEnabled +
                 ", notNullCheckEnabled=" + notNullCheckEnabled +
                 ", function=" + function +