Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java
===================================================================
--- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java	(revision 1847134)
+++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzer.java	(working copy)
@@ -30,7 +30,7 @@
  */
 public class LSHAnalyzer extends Analyzer {
 
-    private static final int DEFAULT_SHINGLE_SIZE = 4;
+    private static final int DEFAULT_SHINGLE_SIZE = 5;
 
     private final int min;
     private final int max;
Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java
===================================================================
--- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java	(revision 1847134)
+++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/SimSearchUtils.java	(working copy)
@@ -39,6 +39,7 @@
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.BytesRef;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -80,15 +81,15 @@
         return doubles;
     }
 
-    private static Collection<String> getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException {
-        Collection<String> tokens = new LinkedList<>();
+    private static Collection<BytesRef> getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException {
+        Collection<BytesRef> tokens = new LinkedList<>();
         TokenStream ts = analyzer.tokenStream(field, sampleTextString);
+        ts.addAttribute(CharTermAttribute.class);
         ts.reset();
-        ts.addAttribute(CharTermAttribute.class);
         while (ts.incrementToken()) {
             CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
             String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length());
-            tokens.add(token);
+            tokens.add(new BytesRef(token));
         }
         ts.end();
         ts.close();
@@ -95,14 +96,8 @@
         return tokens;
     }
 
-    static BooleanQuery getSimQuery(Analyzer analyzer, String fieldName, String text) throws IOException {
-        Collection<String> tokens = getTokens(analyzer, fieldName, text);
-        BooleanQuery booleanQuery = new BooleanQuery(true);
-        booleanQuery.setMinimumNumberShouldMatch(3);
-        for (String token : tokens) {
-            booleanQuery.add(new ConstantScoreQuery(new TermQuery(new Term(fieldName, token))), BooleanClause.Occur.SHOULD);
-        }
-        return booleanQuery;
+    static Query getSimQuery(Analyzer analyzer, String fieldName, String text) throws IOException {
+        return createLSHQuery(fieldName, getTokens(analyzer, fieldName, text), 1f, 1f);
     }
 
 
@@ -156,7 +151,7 @@
                 String fvString = doc.get(similarityFieldName);
                 if (fvString != null && fvString.trim().length() > 0) {
                     log.trace("generating sim query on field {} and text {}", similarityFieldName, fvString);
-                    BooleanQuery simQuery = SimSearchUtils.getSimQuery(analyzer, similarityFieldName, fvString);
+                    Query simQuery = SimSearchUtils.getSimQuery(analyzer, similarityFieldName, fvString);
                     booleanQuery.add(new BooleanClause(simQuery, SHOULD));
                     log.trace("similarity query generated for {}", pd.name);
                 }
@@ -174,4 +169,86 @@
         }
     }
 
+    /**
+     * Builds a banded LSH query over the given min-hash terms.
+     * <p>
+     * With a band size of 1 every hash becomes its own optional (SHOULD) clause. Otherwise
+     * hashes are grouped into bands: all hashes of a band must match (MUST) while the bands
+     * themselves are optional. A trailing, incomplete band is padded with hashes taken from
+     * the start of the collection.
+     *
+     * @param field name of the field the hashes are searched in
+     * @param minhashes the hashes to search for
+     * @param similarity similarity threshold; sizes the bands and, when it is below 1 and
+     *                   {@code expectedTruePositive >= 1}, the minimum number of matching clauses
+     * @param expectedTruePositive expected true positive rate; values below 1 enable banding
+     * @return the resulting boolean query
+     */
+    private static Query createLSHQuery(String field, Collection<BytesRef> minhashes,
+                                        float similarity, float expectedTruePositive) {
+        int bandSize = 1;
+        if (expectedTruePositive < 1) {
+            bandSize = computeBandSize(minhashes.size(), similarity, expectedTruePositive);
+        }
+
+        BooleanQuery builder = new BooleanQuery();
+        BooleanQuery childBuilder = new BooleanQuery();
+        int rowInBand = 0;
+        for (BytesRef minHash : minhashes) {
+            TermQuery tq = new TermQuery(new Term(field, minHash));
+            if (bandSize == 1) {
+                builder.add(new ConstantScoreQuery(tq), BooleanClause.Occur.SHOULD);
+            } else {
+                childBuilder.add(new ConstantScoreQuery(tq), BooleanClause.Occur.MUST);
+                rowInBand++;
+                if (rowInBand == bandSize) {
+                    builder.add(new ConstantScoreQuery(childBuilder),
+                            BooleanClause.Occur.SHOULD);
+                    childBuilder = new BooleanQuery();
+                    rowInBand = 0;
+                }
+            }
+        }
+        // Avoid a dubious narrow band, wrap around and pad with the start
+        if (childBuilder.clauses().size() > 0) {
+            for (BytesRef token : minhashes) {
+                // pass the raw bytes: BytesRef.toString() is a hex dump, not the term text
+                TermQuery tq = new TermQuery(new Term(field, token));
+                childBuilder.add(new ConstantScoreQuery(tq), BooleanClause.Occur.MUST);
+                rowInBand++;
+                if (rowInBand == bandSize) {
+                    builder.add(new ConstantScoreQuery(childBuilder),
+                            BooleanClause.Occur.SHOULD);
+                    break;
+                }
+            }
+        }
+
+        if (expectedTruePositive >= 1.0 && similarity < 1) {
+            builder.setMinimumNumberShouldMatch((int) (Math.ceil(minhashes.size() * similarity)));
+        }
+        log.trace("similarity query with bands : {}, minShouldMatch : {}, no. of clauses : {}", bandSize,
+                builder.getMinimumNumberShouldMatch(), builder.clauses().size());
+        return builder;
+
+    }
+
+    /**
+     * Finds the number of rows per band by trying increasing band counts until the
+     * estimated true positive rate {@code 1 - (1 - similarity^rows)^bands} exceeds
+     * {@code expectedTruePositive}.
+     *
+     * @return rows per band, or 1 if no band count satisfies the expected rate
+     */
+    private static int computeBandSize(int numHash, double similarity, double expectedTruePositive) {
+        for (int bands = 1; bands <= numHash; bands++) {
+            int rowsInBand = numHash / bands;
+            double truePositive = 1 - Math.pow(1 - Math.pow(similarity, rowsInBand), bands);
+            if (truePositive > expectedTruePositive) {
+                return rowsInBand;
+            }
+        }
+        return 1;
+    }
+
 }
Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
===================================================================
--- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java	(revision 1847134)
+++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java	(working copy)
@@ -2988,6 +2988,7 @@
             String name = split[0];
             Tree child = test.addChild(name);
             child.setProperty("fv", blob, Type.BINARY);
+            children.add(child.getPath());
         }
         root.commit();
 
@@ -3019,7 +3020,6 @@
         idxb.build(idx);
         root.commit();
 
-
         Tree test = root.getTree("/").addChild("test");
 
         URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzerTest.java
===================================================================
--- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzerTest.java	(revision 1847134)
+++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/fv/LSHAnalyzerTest.java	(working copy)
@@ -80,7 +80,6 @@
         IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer));
         DirectoryReader reader = null;
         try {
-
             List<Double> values = new LinkedList<>();
             values.add(0.1d);
             values.add(0.3d);