Feature vector similarity search: store the raw vector bytes in a dedicated
stored field and rerank "mlt?" results by euclidean distance at query time.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and blank lines are reconstructed, and generic type arguments on
context/removed lines are reproduced as found (they look stripped) -- confirm
against revision 1843494, or apply with whitespace-insensitive matching.

Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java
===================================================================
--- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java	(revision 1843494)
+++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java	(working copy)
@@ -29,11 +29,12 @@
 import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;
 import org.apache.jackrabbit.util.ISO8601;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.IntField;
+import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 
 import static org.apache.lucene.document.Field.Store.NO;
 import static org.apache.lucene.document.Field.Store.YES;
@@ -107,21 +108,29 @@
     }
 
     public static Collection newSimilarityFields(String name, Blob value) throws IOException {
-        Collection fields = new ArrayList<>(1);
+        Collection<Field> fields = new ArrayList<>(2);
         byte[] bytes = new BlobByteSource(value).read();
-//        fields.add(newBinarySimilarityField(name, bytes));
+        fields.add(newBinarySimilarityField(name, bytes));
         fields.add(newSimilarityField(name, bytes));
         return fields;
     }
 
     public static Collection newSimilarityFields(String name, String value) {
-        Collection fields = new ArrayList<>(1);
-//        byte[] bytes = SimSearchUtils.toByteArray(value);
-//        fields.add(newBinarySimilarityField(name, bytes));
+        Collection<Field> fields = new ArrayList<>(2);
+        byte[] bytes = SimSearchUtils.toByteArray(value);
+        fields.add(newBinarySimilarityField(name, bytes));
         fields.add(newSimilarityField(name, value));
         return fields;
     }
 
+    /**
+     * Creates the stored field holding the raw bytes of a similarity value, so
+     * that the value can be read back from the index at query time.
+     */
+    private static StoredField newBinarySimilarityField(String name, byte[] bytes) {
+        return new StoredField(FieldNames.createBinSimilarityFieldName(name), bytes);
+    }
+
     private static Field newSimilarityField(String name, byte[] bytes) {
         return newSimilarityField(name, SimSearchUtils.toDoubleString(bytes));
     }
Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java
===================================================================
--- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java	(revision 1843494)
+++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java	(working copy)
@@ -120,6 +120,7 @@
 import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
 import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
@@ -373,6 +374,29 @@
                     mergedFieldInfos = MultiFields.getMergedFieldInfos(searcher.getIndexReader());
                 }
 
+                if (docs.scoreDocs.length > 1) {
+                    // reranking step for fv sim search
+                    PropertyRestriction pr = null;
+                    LuceneIndexDefinition defn = indexNode.getDefinition();
+                    if (defn.hasFunctionDefined()) {
+                        pr = filter.getPropertyRestriction(defn.getFunctionName());
+                    }
+                    if (pr != null) {
+                        String queryString = String.valueOf(pr.first.getValue(pr.first.getType()));
+                        if (queryString.startsWith("mlt?")) {
+                            List<PropertyDefinition> sp = new ArrayList<>();
+                            for (IndexingRule r : defn.getDefinedRules()) {
+                                sp.addAll(r.getSimilarityProperties());
+                            }
+                            if (!sp.isEmpty()) {
+                                long fvs = PERF_LOGGER.start();
+                                bruteForceFVRerank(sp, docs);
+                                PERF_LOGGER.end(fvs, -1, "fv reranking done");
+                            }
+                        }
+                    }
+                }
+
                 for (ScoreDoc doc : docs.scoreDocs) {
                     Map excerpts = null;
                     if (addExcerpt) {
@@ -473,6 +497,98 @@
         return !queue.isEmpty();
     }
 
+    /**
+     * Reranks the hits of a feature vector similarity search by brute force.
+     * <p>
+     * The first hit is assumed to be the input document. For each similarity
+     * property the euclidean distance between the input vector and the vector
+     * of every hit is computed: hits farther away than a fixed threshold are
+     * dropped, all compared hits get their score boosted by the inverse
+     * distance. The first hit stays first, the others are sorted by descending
+     * score and only the top {@code k} hits are retained.
+     *
+     * @param sp   the similarity properties to rerank on
+     * @param docs the hits to rerank, updated in place
+     * @throws IOException if the stored fields of a hit cannot be read
+     */
+    private void bruteForceFVRerank(List<PropertyDefinition> sp, TopDocs docs) throws IOException {
+        if (docs.scoreDocs.length == 0) {
+            return;
+        }
+        double farthestDistance = 50d; // a threshold distance above which a hit is discarded
+        int k = 10; // number of nearest neighbours to retain
+        ScoreDoc inputDoc = docs.scoreDocs[0]; // we assume the input doc is the first one returned
+        boolean[] discard = new boolean[docs.scoreDocs.length];
+        for (PropertyDefinition pd : sp) {
+            String fieldName = FieldNames.createBinSimilarityFieldName(pd.name);
+            double[] inputVector = readSimilarityVector(inputDoc.doc, fieldName);
+            if (inputVector == null) {
+                continue; // nothing to compare against for this property
+            }
+            for (int j = 0; j < docs.scoreDocs.length; j++) {
+                double[] currentVector = readSimilarityVector(docs.scoreDocs[j].doc, fieldName);
+                if (currentVector == null || currentVector.length != inputVector.length) {
+                    continue; // not comparable, leave the score untouched
+                }
+                double distance = dist(inputVector, currentVector) + 1e-10; // constant term to avoid division by zero
+                if (distance > farthestDistance) {
+                    discard[j] = true;
+                }
+                docs.scoreDocs[j].score += 1d / distance; // additive similarity boosting
+            }
+        }
+        List<ScoreDoc> kept = new ArrayList<>(docs.scoreDocs.length);
+        for (int j = 0; j < docs.scoreDocs.length; j++) {
+            if (!discard[j]) {
+                kept.add(docs.scoreDocs[j]);
+            }
+        }
+        ScoreDoc[] reranked = kept.toArray(new ScoreDoc[0]);
+        if (reranked.length > 1) {
+            // keep the input doc first; toIndex is exclusive so the last hit is sorted too
+            Arrays.sort(reranked, 1, reranked.length, (o1, o2) -> Float.compare(o2.score, o1.score));
+        }
+        if (reranked.length > k) {
+            reranked = Arrays.copyOfRange(reranked, 0, k); // retain only the top k nearest neighbours
+        }
+        docs.scoreDocs = reranked;
+        if (reranked.length > 0) {
+            docs.setMaxScore(reranked[0].score);
+        }
+    }
+
+    /**
+     * Reads the binary similarity value stored for a document and decodes it.
+     *
+     * @param docId     the lucene document id
+     * @param fieldName the name of the stored binary similarity field
+     * @return the decoded vector, or {@code null} if the document has no such field
+     * @throws IOException if the stored fields cannot be read
+     */
+    @Nullable
+    private double[] readSimilarityVector(int docId, String fieldName) throws IOException {
+        BytesRef binaryValue = indexSearcher.doc(docId).getBinaryValue(fieldName);
+        if (binaryValue == null) {
+            return null;
+        }
+        // only the [offset, offset + length) slice of the backing array belongs to the value
+        byte[] bytes = Arrays.copyOfRange(binaryValue.bytes, binaryValue.offset,
+                binaryValue.offset + binaryValue.length);
+        return SimSearchUtils.toDoubleArray(bytes);
+    }
+
+    /**
+     * Computes the euclidean distance between two vectors of equal length.
+     */
+    private double dist(double[] x, double[] y) {
+        double d = 0;
+        for (int i = 0; i < x.length; i++) {
+            double diff = y[i] - x[i];
+            d += diff * diff;
+        }
+        return Math.sqrt(d);
+    }
+
     private IndexSearcher getCurrentSearcher(LuceneIndexNode indexNode) {
         //The searcher once obtained is held till either cursor is finished
         //or if the index gets updated. It needs to be ensured that
Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
===================================================================
--- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java	(revision 1843494)
+++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java	(working copy)
@@ -52,7 +52,7 @@
     private static final Logger log = LoggerFactory.getLogger(SimSearchUtils.class);
 
     public static String toDoubleString(byte[] bytes) {
-        Double[] a = toDoubleArray(bytes);
+        double[] a = toDoubleArray(bytes);
         StringBuilder builder = new StringBuilder();
         for (Double d : a) {
             if (builder.length() > 0) {
@@ -63,11 +63,6 @@
         return builder.toString();
     }
 
-    private static Double[] toDoubleArray(byte[] array) {
-        List doubles = toDoubles(array);
-        return doubles.toArray(new Double[doubles.size()]);
-    }
-
     public static List toDoubles(byte[] array) {
         int blockSize = Double.SIZE / Byte.SIZE;
         ByteBuffer wrap = ByteBuffer.wrap(array);
@@ -80,6 +75,24 @@
         return doubles;
     }
 
+    /**
+     * Decodes a byte array into the doubles it contains, 8 bytes per value in
+     * big-endian order; trailing bytes not filling a whole double are ignored.
+     *
+     * @param array the bytes to decode
+     * @return the decoded doubles, possibly empty
+     */
+    public static double[] toDoubleArray(byte[] array) {
+        int blockSize = Double.SIZE / Byte.SIZE;
+        ByteBuffer wrap = ByteBuffer.wrap(array);
+        int capacity = array.length / blockSize;
+        double[] doubles = new double[capacity];
+        for (int i = 0; i < capacity; i++) {
+            doubles[i] = wrap.getDouble(i * blockSize);
+        }
+        return doubles;
+    }
+
     private static Collection getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException {
         Collection tokens = new LinkedList<>();
         TokenStream ts = analyzer.tokenStream(field, sampleTextString);
Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
===================================================================
--- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java	(revision 1843494)
+++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java	(working copy)
@@ -2988,6 +2988,7 @@
             String name = split[0];
             Tree child = test.addChild(name);
             child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
         }
         root.commit();
 
Index: oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java
===================================================================
--- oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java	(revision 1843494)
+++ oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/FieldNames.java	(working copy)
@@ -60,6 +60,11 @@
     private static final String SIMILARITY_PREFIX = "sim:";
 
     /**
+     * Prefix for all field names that contain the similarity search binary values.
+     */
+    private static final String SIMILARITY_BINARY_PREFIX = "simbin:";
+
+    /**
      * Name of the field that contains the suggest index.
      */
     public static final String SUGGEST = ":suggest";
@@ -147,4 +152,12 @@
     public static String createSimilarityFieldName(String name) {
         return SIMILARITY_PREFIX + name;
     }
+
+    /**
+     * Returns the name of the field holding the binary form of the similarity
+     * value of the property with the given name.
+     */
+    public static String createBinSimilarityFieldName(String name) {
+        return SIMILARITY_BINARY_PREFIX + name;
+    }
 }