Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldFactory.java (working copy) @@ -16,15 +16,22 @@ */ package org.apache.jackrabbit.oak.plugins.index.lucene; +import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collection; import com.google.common.primitives.Ints; +import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.commons.PathUtils; +import org.apache.jackrabbit.oak.plugins.index.lucene.binary.BlobByteSource; +import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.SimSearchUtils; import org.apache.jackrabbit.util.ISO8601; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.IntField; +import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; @@ -103,6 +110,34 @@ return new StringField(name, value, NO); } + static Collection newSimilarityFields(String name, Blob value) throws IOException { + Collection fields = new ArrayList<>(2); + byte[] bytes = new BlobByteSource(value).read(); + fields.add(newBinarySimilarityField(name, bytes)); + fields.add(newSimilarityField(name, bytes)); + return fields; + } + + static Collection newSimilarityFields(String name, String value) { + Collection fields = new ArrayList<>(2); + byte[] bytes = SimSearchUtils.toByteArray(value); + fields.add(newBinarySimilarityField(name, bytes)); + fields.add(newSimilarityField(name, value)); + return fields; + } + + private static Field newSimilarityField(String name, byte[] bytes) { + return newSimilarityField(name, 
SimSearchUtils.toDoubleString(bytes)); + } + + private static Field newSimilarityField(String name, String value) { + return new TextField(FieldNames.createSimilarityFieldName(name), value, Field.Store.NO); + } + + private static StoredField newBinarySimilarityField(String name, byte[] bytes) { + return new StoredField(FieldNames.createBinSimilarityFieldName(name), bytes); + } + public static Field newFulltextField(String value) { return newFulltextField(value, false); } Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/FieldNames.java (working copy) @@ -70,6 +70,16 @@ public static final String ANALYZED_FIELD_PREFIX = "full:"; /** + * Prefix for all field names that contain the similarity search indexed tokens. + */ + private static final String SIMILARITY_PREFIX = "sim:"; + + /** + * Prefix for all field names that contain the similarity search binary values. 
+ */ + private static final String SIMILARITY_BINARY_PREFIX = "simbin:"; + + /** * Prefix used for storing fulltext of relative node */ public static final String FULLTEXT_RELATIVE_NODE = "fullnode:"; @@ -138,4 +148,12 @@ && !field.startsWith(":") && !field.endsWith("_facet"); } + + public static String createBinSimilarityFieldName(String name) { + return SIMILARITY_BINARY_PREFIX + name; + } + + public static String createSimilarityFieldName(String name) { + return SIMILARITY_PREFIX + name; + } } Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java (working copy) @@ -911,6 +911,7 @@ private final List notNullCheckEnabledProperties; private final List nodeScopeAnalyzedProps; private final List syncProps; + private final List similarityProperties; private final boolean indexesAllNodesOfMatchingType; private final boolean nodeNameIndexed; @@ -925,6 +926,7 @@ final Aggregate propAggregate; + IndexingRule(String nodeTypeName, NodeState config) { this.nodeTypeName = nodeTypeName; this.baseNodeType = nodeTypeName; @@ -938,9 +940,10 @@ List existentProperties = newArrayList(); List nodeScopeAnalyzedProps = newArrayList(); List syncProps = newArrayList(); + List similarityProperties = newArrayList(); List propIncludes = newArrayList(); this.propConfigs = collectPropConfigs(config, namePatterns, propIncludes, nonExistentProperties, - existentProperties, nodeScopeAnalyzedProps, functionRestrictions, syncProps); + existentProperties, nodeScopeAnalyzedProps, functionRestrictions, syncProps, similarityProperties); this.propAggregate = new Aggregate(nodeTypeName, propIncludes); this.aggregate = combine(propAggregate, nodeTypeName); @@ -949,6 +952,7 @@ 
this.nullCheckEnabledProperties = ImmutableList.copyOf(nonExistentProperties); this.functionRestrictions = ImmutableList.copyOf(functionRestrictions); this.notNullCheckEnabledProperties = ImmutableList.copyOf(existentProperties); + this.similarityProperties = ImmutableList.copyOf(similarityProperties); this.fulltextEnabled = aggregate.hasNodeAggregates() || hasAnyFullTextEnabledProperty(); this.nodeFullTextIndexed = aggregate.hasNodeAggregates() || anyNodeScopeIndexedProperty(); this.propertyIndexEnabled = hasAnyPropertyIndexConfigured(); @@ -985,6 +989,7 @@ this.indexesAllNodesOfMatchingType = areAlMatchingNodeByTypeIndexed(); this.nodeNameIndexed = original.nodeNameIndexed; this.syncProps = original.syncProps; + this.similarityProperties = original.similarityProperties; } /** @@ -1032,6 +1037,10 @@ return nodeScopeAnalyzedProps; } + public List getSimilarityProperties() { + return similarityProperties; + } + @Override public String toString() { String str = "IndexRule: "+ nodeTypeName; @@ -1153,7 +1162,8 @@ List existentProperties, List nodeScopeAnalyzedProps, List functionRestrictions, - List syncProps) { + List syncProps, + List similarityProperties) { Map propDefns = newHashMap(); NodeState propNode = config.getChildNode(LuceneIndexConstants.PROP_NODE); @@ -1232,6 +1242,9 @@ if (pd.sync) { syncProps.add(pd); } + if (pd.useInSimilarity) { + similarityProperties.add(pd); + } } } ensureNodeTypeIndexingIsConsistent(propDefns, syncProps); Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java (working copy) @@ -62,6 +62,7 @@ import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField; import static 
org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPathField; import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newPropertyField; +import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newSimilarityFields; import static org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName; public class LuceneDocumentMaker { @@ -253,7 +254,15 @@ boolean includeTypeForFullText = indexingRule.includePropertyType(property.getType().tag()); boolean dirty = false; - if (Type.BINARY.tag() == property.getType().tag() + if (Type.BINARY.tag() == property.getType().tag() && pd.useInSimilarity) { + try { + log.trace("indexing similarity binaries for {}", pd.name); + fields.addAll(newSimilarityFields(pd.name, property.getValue(Type.BINARY))); + dirty = true; + } catch (Exception e) { + log.error("could not index similarity field for property {} and definition {}", property, pd); + } + } else if (Type.BINARY.tag() == property.getType().tag() && includeTypeForFullText) { fields.addAll(newBinary(property, state, null, path + "@" + pname)); dirty = true; @@ -285,10 +294,17 @@ if (pd.nodeScopeIndex) { Field field = newFulltextField(value); fields.add(field); + if (pd.useInSimilarity) { + log.info("indexing similarity strings for {}", pd.name); + fields.addAll(newSimilarityFields(pd.name, value)); // fallback for when feature vectors are written in string typed properties + } } + + dirty = true; } } + if (pd.facet && isFacetingEnabled()) { dirty |= addFacetFields(fields, property, pname, pd); } Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java (working copy) @@ -305,6 +305,11 
@@ String PROP_USE_IN_SPELLCHECK = "useInSpellcheck"; /** + * whether to use this property's values for similarity search + */ + String PROP_USE_IN_SIMILARITY = "useInSimilarity"; + + /** * Property definition config indicating that null check support should be * enabled for this property */ Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndex.java (working copy) @@ -18,10 +18,6 @@ */ package org.apache.jackrabbit.oak.plugins.index.lucene; -import javax.annotation.CheckForNull; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import javax.jcr.PropertyType; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -33,8 +29,12 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.Collectors; +import javax.annotation.CheckForNull; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import javax.jcr.PropertyType; + import com.google.common.base.Joiner; import com.google.common.collect.AbstractIterator; import com.google.common.collect.FluentIterable; @@ -52,6 +52,8 @@ import org.apache.jackrabbit.oak.commons.PerfLogger; import org.apache.jackrabbit.oak.commons.json.JsopBuilder; import org.apache.jackrabbit.oak.commons.json.JsopWriter; +import org.apache.jackrabbit.oak.plugins.index.Cursors; +import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor; import org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.IndexingRule; import org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PlanResult; import org.apache.jackrabbit.oak.plugins.index.lucene.IndexPlanner.PropertyIndexResult; @@ -58,21 
+60,15 @@ import org.apache.jackrabbit.oak.plugins.index.lucene.property.HybridPropertyIndexLookup; import org.apache.jackrabbit.oak.plugins.index.lucene.score.ScorerProviderFactory; import org.apache.jackrabbit.oak.plugins.index.lucene.spi.FulltextQueryTermsProvider; +import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.BinaryMinHashAnalyzer; import org.apache.jackrabbit.oak.plugins.index.lucene.util.FacetHelper; import org.apache.jackrabbit.oak.plugins.index.lucene.util.MoreLikeThisHelper; import org.apache.jackrabbit.oak.plugins.index.lucene.util.PathStoredFieldVisitor; import org.apache.jackrabbit.oak.plugins.index.lucene.util.SpellcheckHelper; import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper; +import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.SimSearchUtils; import org.apache.jackrabbit.oak.plugins.memory.PropertyValues; -import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd; -import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains; -import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression; -import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr; -import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm; -import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor; import org.apache.jackrabbit.oak.spi.query.Cursor; -import org.apache.jackrabbit.oak.plugins.index.Cursors; -import org.apache.jackrabbit.oak.plugins.index.Cursors.PathCursor; import org.apache.jackrabbit.oak.spi.query.Filter; import org.apache.jackrabbit.oak.spi.query.Filter.PropertyRestriction; import org.apache.jackrabbit.oak.spi.query.IndexRow; @@ -80,6 +76,12 @@ import org.apache.jackrabbit.oak.spi.query.QueryIndex; import org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvanceFulltextQueryIndex; import org.apache.jackrabbit.oak.spi.query.QueryLimits; +import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextAnd; +import 
org.apache.jackrabbit.oak.spi.query.fulltext.FullTextContains; +import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextExpression; +import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextOr; +import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextTerm; +import org.apache.jackrabbit.oak.spi.query.fulltext.FullTextVisitor; import org.apache.jackrabbit.oak.spi.state.NodeState; import org.apache.jackrabbit.oak.spi.state.NodeStateUtils; import org.apache.lucene.analysis.Analyzer; @@ -125,6 +127,7 @@ import org.apache.lucene.search.postingshighlight.PostingsHighlighter; import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.search.suggest.Lookup; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -150,7 +153,9 @@ import static org.apache.jackrabbit.oak.spi.query.QueryConstants.REP_EXCERPT; import static org.apache.jackrabbit.oak.spi.query.QueryIndex.AdvancedQueryIndex; import static org.apache.jackrabbit.oak.spi.query.QueryIndex.NativeQueryIndex; -import static org.apache.lucene.search.BooleanClause.Occur.*; +import static org.apache.lucene.search.BooleanClause.Occur.MUST; +import static org.apache.lucene.search.BooleanClause.Occur.MUST_NOT; +import static org.apache.lucene.search.BooleanClause.Occur.SHOULD; /** * Provides a QueryIndex that does lookups against a Lucene-based index @@ -917,10 +922,21 @@ if (query.startsWith("mlt?")) { String mltQueryString = query.replace("mlt?", ""); if (reader != null) { - Query moreLikeThis = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString); - if (moreLikeThis != null) { - qs.add(moreLikeThis); + List sp = new LinkedList<>(); + for (IndexingRule r : defn.getDefinedRules()) { + sp.addAll(r.getSimilarityProperties()); } + if (sp.isEmpty()) { + Query moreLikeThis = MoreLikeThisHelper.getMoreLikeThis(reader, analyzer, mltQueryString); + if (moreLikeThis != null) { + qs.add(moreLikeThis); + } + } 
else { + Query similarityQuery = getSimilarityQuery(sp, reader, mltQueryString); + if (similarityQuery != null) { + qs.add(similarityQuery); + } + } } } else if (query.startsWith("spellcheck?")) { String spellcheckQueryString = query.replace("spellcheck?", ""); @@ -1842,4 +1858,57 @@ } } + private static Query getSimilarityQuery(List sp, IndexReader reader, String queryString) { + try { + LOG.info("parsing similarity query on {}", queryString); + Query similarityQuery = null; + String text = null; + for (String param : queryString.split("&")) { + String[] keyValuePair = param.split("="); + if (keyValuePair.length != 2 || keyValuePair[0] == null || keyValuePair[1] == null) { + throw new RuntimeException("Unparsable native Lucene MLT query for similarity: " + queryString); + } else { + if ("stream.body".equals(keyValuePair[0])) { + text = keyValuePair[1]; + break; + } + } + } + + if (text != null && !sp.isEmpty()) { + LOG.info("running sim query on {}", text); + BooleanQuery booleanQuery = new BooleanQuery(true); + BinaryMinHashAnalyzer analyzer = new BinaryMinHashAnalyzer(); + // retrieve input doc to compare with + IndexSearcher searcher = new IndexSearcher(reader); + TermQuery q = new TermQuery(new Term(FieldNames.PATH, text)); + TopDocs top = searcher.search(q, 1); + if (top.totalHits > 0) { + ScoreDoc d = top.scoreDocs[0]; + Document doc = reader.document(d.doc); + for (PropertyDefinition pd : sp) { + LOG.debug("adding sim clause for property {}", pd.name); + IndexableField indexableField = doc.getField(FieldNames.createBinSimilarityFieldName(pd.name)); + if (indexableField != null) { + LOG.trace("found binary property value for {}", pd.name); + BytesRef bytesRef = indexableField.binaryValue(); + String fvString = SimSearchUtils.toDoubleString(bytesRef.bytes); + String fieldName = FieldNames.createSimilarityFieldName(pd.name); + LOG.trace("generating sim query on field {} and text {}", fieldName, fvString); + BooleanQuery simQuery = 
SimSearchUtils.getSimQuery(analyzer, fieldName, fvString); + LOG.trace("sim query generated {}", simQuery); + booleanQuery.add(new BooleanClause(simQuery, SHOULD)); + } + } + } + similarityQuery = booleanQuery; + LOG.trace("final sim query is {}", similarityQuery); + } + + return similarityQuery; + } catch (Exception e) { + throw new RuntimeException("could not handle similarity query " + queryString); + } + } + } Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/PropertyDefinition.java (working copy) @@ -58,7 +58,7 @@ * property etc then it should be defined via 'name' property in NodeState. * In such case NodeState name can be set to anything */ - final String name; + public final String name; private final int propertyType; /** @@ -123,7 +123,9 @@ public final boolean unique; - public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState defn) { + public boolean useInSimilarity; + + public PropertyDefinition(IndexingRule idxDefn, String nodeName, NodeState defn) { this.isRegexp = getOptionalValue(defn, PROP_IS_REGEX, false); this.name = getName(defn, nodeName); this.relative = isRelativeProperty(name); @@ -151,6 +153,7 @@ this.propertyType = getPropertyType(idxDefn, nodeName, defn); this.useInSuggest = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_USE_IN_SUGGEST, false); this.useInSpellcheck = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_USE_IN_SPELLCHECK, false); + this.useInSimilarity = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_USE_IN_SIMILARITY, false); this.nullCheckEnabled = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_NULL_CHECK_ENABLED, false); this.notNullCheckEnabled = 
getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_NOT_NULL_CHECK_ENABLED, false); this.excludeFromAggregate = getOptionalValueIfIndexed(defn, LuceneIndexConstants.PROP_EXCLUDE_FROM_AGGREGATE, false); Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/IndexDefinitionBuilder.java (working copy) @@ -319,6 +319,11 @@ return this; } + public PropertyRule useInSimilarity() { + propTree.setProperty(LuceneIndexConstants.PROP_USE_IN_SIMILARITY, true); + return this; + } + public PropertyRule type(String type){ //This would throw an IAE if type is invalid PropertyType.valueFromName(type); Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/BinaryMinHashAnalyzer.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/BinaryMinHashAnalyzer.java (nonexistent) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/BinaryMinHashAnalyzer.java (working copy) @@ -0,0 +1,49 @@ +package org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch; + +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.util.Version; + +/** + * + */ +public class BinaryMinHashAnalyzer extends Analyzer { + + private static final int DEFAULT_SHINGLE_SIZE = 5; + + private final int min; + private final int max; + private final int 
hashCount; + private final int bucketCount; + private final int hashSetSize; + + private BinaryMinHashAnalyzer(int min, int max, int hashCount, int bucketCount, int hashSetSize) { + super(); + this.min = min; + this.max = max; + this.hashCount = hashCount; + this.bucketCount = bucketCount; + this.hashSetSize = hashSetSize; + } + + public BinaryMinHashAnalyzer() { + this(DEFAULT_SHINGLE_SIZE, DEFAULT_SHINGLE_SIZE, MinHashFilter.DEFAULT_HASH_COUNT, MinHashFilter.DEFAULT_BUCKET_COUNT, MinHashFilter.DEFAULT_HASH_SET_SIZE); + } + + @Override + protected TokenStreamComponents createComponents(String fieldName, Reader reader) { + Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_47, reader); + ShingleFilter shingleFilter = new ShingleFilter(source, min, max); + shingleFilter.setTokenSeparator(" "); + shingleFilter.setOutputUnigrams(false); + shingleFilter.setOutputUnigramsIfNoShingles(false); + TokenStream filter = new MinHashFilter(shingleFilter, hashCount, bucketCount, hashSetSize, bucketCount > 1); + return new TokenStreamComponents(source, filter); + } + +} Property changes on: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/BinaryMinHashAnalyzer.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/MinHashFilter.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/MinHashFilter.java (nonexistent) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/MinHashFilter.java (working copy) @@ -0,0 +1,515 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.TreeSet; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; + +/** + * Generate min hash tokens from an incoming stream of tokens. The incoming tokens would typically be 5 word shingles. + * + * The number of hashes used and the number of minimum values for each hash can be set. You could have 1 hash and keep + * the 100 lowest values or 100 hashes and keep the lowest one for each. Hashes can also be bucketed in ranges over the + * 128-bit hash space, + * + * A 128-bit hash is used internally. 
5 word shingles from 10e5 words generate 10e25 combinations So a 64 bit hash would + * have collisions (1.8e19) + * + * When using different hashes 32 bits are used for the hash position leaving scope for 8e28 unique hashes. A single + * hash will use all 128 bits. + * + */ +public class MinHashFilter extends TokenFilter { + private static final int HASH_CACHE_SIZE = 512; + + private static final LongPair[] cachedIntHashes = new LongPair[HASH_CACHE_SIZE]; + + public static final int DEFAULT_HASH_COUNT = 1; + + public static final int DEFAULT_HASH_SET_SIZE = 3; + + public static final int DEFAULT_BUCKET_COUNT = 512; + + private static final String MIN_HASH_TYPE = "MIN_HASH"; + + private final List>> minHashSets; + + private int hashSetSize = DEFAULT_HASH_SET_SIZE; + + private int bucketCount = DEFAULT_BUCKET_COUNT; + + private int hashCount = DEFAULT_HASH_COUNT; + + private boolean requiresInitialisation = true; + + private State endState; + + private int hashPosition = -1; + + private int bucketPosition = -1; + + private long bucketSize; + + private final boolean withRotation; + + private int endOffset; + + private boolean exhausted = false; + + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); + private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); + private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class); + + static { + for (int i = 0; i < HASH_CACHE_SIZE; i++) { + cachedIntHashes[i] = new LongPair(); + murmurhash3_x64_128(getBytes(i), 0, 4, 0, cachedIntHashes[i]); + } + } + + static byte[] getBytes(int i) { + byte[] answer = new byte[4]; + answer[3] = (byte) (i); + answer[2] = (byte) (i >> 8); + answer[1] = (byte) (i >> 16); + answer[0] = (byte) (i >> 24); + return 
answer; + } + + /** + * create a MinHash filter + * + * @param input the token stream + * @param hashCount the no. of hashes + * @param bucketCount the no. of buckets for hashing + * @param hashSetSize the no. of min hashes to keep + * @param withRotation whether rotate or not hashes while incrementing tokens + */ + public MinHashFilter(TokenStream input, int hashCount, int bucketCount, int hashSetSize, boolean withRotation) { + super(input); + if (hashCount <= 0) { + throw new IllegalArgumentException("hashCount must be greater than zero"); + } + if (bucketCount <= 0) { + throw new IllegalArgumentException("bucketCount must be greater than zero"); + } + if (hashSetSize <= 0) { + throw new IllegalArgumentException("hashSetSize must be greater than zero"); + } + this.hashCount = hashCount; + this.bucketCount = bucketCount; + this.hashSetSize = hashSetSize; + this.withRotation = withRotation; + this.bucketSize = (1L << 32) / bucketCount; + if((1L << 32) % bucketCount != 0) + { + bucketSize++; + } + minHashSets = new ArrayList<>(this.hashCount); + for (int i = 0; i < this.hashCount; i++) { + ArrayList> buckets = new ArrayList<>(this.bucketCount); + minHashSets.add(buckets); + for (int j = 0; j < this.bucketCount; j++) { + FixedSizeTreeSet minSet = new FixedSizeTreeSet<>(this.hashSetSize); + buckets.add(minSet); + } + } + doRest(); + } + + @Override + public final boolean incrementToken() throws IOException { + // Pull the underlying stream of tokens + // Hash each token found + // Generate the required number of variants of this hash + // Keep the minimum hash value found so far of each variant + + int positionIncrement = 0; + if (requiresInitialisation) { + requiresInitialisation = false; + boolean found = false; + // First time through so we pull and hash everything + while (input.incrementToken()) { + found = true; + String current = new String(termAttribute.buffer(), 0, termAttribute.length()); + + for (int i = 0; i < hashCount; i++) { + byte[] bytes = 
current.getBytes("UTF-16LE"); + LongPair hash = new LongPair(); + murmurhash3_x64_128(bytes, 0, bytes.length, 0, hash); + LongPair rehashed = combineOrdered(hash, getIntHash(i)); + minHashSets.get(i).get((int) ((rehashed.val2 >>> 32) / bucketSize)).add(rehashed); + } + endOffset = offsetAttribute.endOffset(); + } + exhausted = true; + input.end(); + // We need the end state so an underlying shingle filter can have its state restored correctly. + endState = captureState(); + if (!found) { + return false; + } + + positionIncrement = 1; + // fix up any wrap around bucket values. ... + if (withRotation && (hashSetSize == 1)) { + for (int hashLoop = 0; hashLoop < hashCount; hashLoop++) { + for (int bucketLoop = 0; bucketLoop < bucketCount; bucketLoop++) { + if (minHashSets.get(hashLoop).get(bucketLoop).size() == 0) { + for (int bucketOffset = 1; bucketOffset < bucketCount; bucketOffset++) { + if (minHashSets.get(hashLoop).get((bucketLoop + bucketOffset) % bucketCount).size() > 0) { + LongPair replacementHash = minHashSets.get(hashLoop).get((bucketLoop + bucketOffset) % bucketCount) + .first(); + minHashSets.get(hashLoop).get(bucketLoop).add(replacementHash); + break; + } + } + } + } + } + } + + } + + clearAttributes(); + + while (hashPosition < hashCount) { + if (hashPosition == -1) { + hashPosition++; + } else { + while (bucketPosition < bucketCount) { + if (bucketPosition == -1) { + bucketPosition++; + } else { + LongPair hash = minHashSets.get(hashPosition).get(bucketPosition).pollFirst(); + if (hash != null) { + termAttribute.setEmpty(); + if (hashCount > 1) { + termAttribute.append(int0(hashPosition)); + termAttribute.append(int1(hashPosition)); + } + long high = hash.val2; + termAttribute.append(long0(high)); + termAttribute.append(long1(high)); + termAttribute.append(long2(high)); + termAttribute.append(long3(high)); + long low = hash.val1; + termAttribute.append(long0(low)); + termAttribute.append(long1(low)); + if (hashCount == 1) { + 
termAttribute.append(long2(low)); + termAttribute.append(long3(low)); + } + posIncAttribute.setPositionIncrement(positionIncrement); + offsetAttribute.setOffset(0, endOffset); + typeAttribute.setType(MIN_HASH_TYPE); + posLenAttribute.setPositionLength(1); + return true; + } else { + bucketPosition++; + } + } + + } + bucketPosition = -1; + hashPosition++; + } + } + return false; + } + + private static LongPair getIntHash(int i) { + if (i < HASH_CACHE_SIZE) { + return cachedIntHashes[i]; + } else { + LongPair answer = new LongPair(); + murmurhash3_x64_128(getBytes(i), 0, 4, 0, answer); + return answer; + } + } + + @Override + public void end() throws IOException { + if(!exhausted) { + input.end(); + } + + restoreState(endState); + } + + @Override + public void reset() throws IOException { + super.reset(); + doRest(); + } + + private void doRest() { + for (int i = 0; i < hashCount; i++) { + for (int j = 0; j < bucketCount; j++) { + minHashSets.get(i).get(j).clear(); + } + } + endState = null; + hashPosition = -1; + bucketPosition = -1; + requiresInitialisation = true; + exhausted = false; + } + + private static char long0(long x) { + return (char) (x >> 48); + } + + private static char long1(long x) { + return (char) (x >> 32); + } + + private static char long2(long x) { + return (char) (x >> 16); + } + + private static char long3(long x) { + return (char) (x); + } + + private static char int0(int x) { + return (char) (x >> 16); + } + + private static char int1(int x) { + return (char) (x); + } + + public static boolean isLessThanUnsigned(long n1, long n2) { + return (n1 < n2) ^ ((n1 < 0) != (n2 < 0)); + } + + static class FixedSizeTreeSet> extends TreeSet { + + /** + * + */ + private static final long serialVersionUID = -8237117170340299630L; + private final int capacity; + + FixedSizeTreeSet() { + this(20); + } + + FixedSizeTreeSet(int capacity) { + super(); + this.capacity = capacity; + } + + @Override + public boolean add(final E toAdd) { + if (capacity <= size()) 
{ + final E lastElm = last(); + if (toAdd.compareTo(lastElm) > -1) { + return false; + } else { + pollLast(); + } + } + return super.add(toAdd); + } + } + + private static LongPair combineOrdered(LongPair... hashCodes) { + LongPair result = new LongPair(); + for (LongPair hashCode : hashCodes) { + result.val1 = result.val1 * 37 + hashCode.val1; + result.val2 = result.val2 * 37 + hashCode.val2; + + } + return result; + } + + /** 128 bits of state */ + static final class LongPair implements Comparable { + public long val1; + public long val2; + + /* + * (non-Javadoc) + * + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(LongPair other) { + if (isLessThanUnsigned(val2, other.val2)) { + return -1; + } else if (val2 == other.val2) { + if (isLessThanUnsigned(val1, other.val1)) { + return -1; + } else if (val1 == other.val1) { + return 0; + } else { + return 1; + } + } else { + return 1; + } + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + LongPair longPair = (LongPair) o; + + return val1 == longPair.val1 && val2 == longPair.val2; + + } + + @Override + public int hashCode() { + int result = (int) (val1 ^ (val1 >>> 32)); + result = 31 * result + (int) (val2 ^ (val2 >>> 32)); + return result; + } + } + + /** Gets a long from a byte buffer in little endian byte order. */ + private static long getLongLittleEndian(byte[] buf, int offset) { + return ((long) buf[offset + 7] << 56) // no mask needed + | ((buf[offset + 6] & 0xffL) << 48) + | ((buf[offset + 5] & 0xffL) << 40) + | ((buf[offset + 4] & 0xffL) << 32) + | ((buf[offset + 3] & 0xffL) << 24) + | ((buf[offset + 2] & 0xffL) << 16) + | ((buf[offset + 1] & 0xffL) << 8) + | ((buf[offset] & 0xffL)); // no shift needed + } + + /** Returns the MurmurHash3_x64_128 hash, placing the result in "out". 
    /**
     * Returns the MurmurHash3_x64_128 hash of {@code len} bytes of {@code key}
     * starting at {@code offset}, placing the 128-bit result in {@code out}.
     * Faithful port of Austin Appleby's public-domain reference implementation;
     * statement order and constants must not be altered.
     */
    @SuppressWarnings("fallthrough") // the huge switch is designed to use fall through into cases!
    static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
        // The original algorithm does have a 32 bit unsigned seed.
        // We have to mask to match the behavior of the unsigned types and prevent sign extension.
        long h1 = seed & 0x00000000FFFFFFFFL;
        long h2 = seed & 0x00000000FFFFFFFFL;

        final long c1 = 0x87c37b91114253d5L;
        final long c2 = 0x4cf5ad432745937fL;

        int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block
        // Body: process the input 16 bytes (two little-endian longs) at a time.
        for (int i = offset; i < roundedEnd; i += 16) {
            long k1 = getLongLittleEndian(key, i);
            long k2 = getLongLittleEndian(key, i + 8);
            k1 *= c1;
            k1 = Long.rotateLeft(k1, 31);
            k1 *= c2;
            h1 ^= k1;
            h1 = Long.rotateLeft(h1, 27);
            h1 += h2;
            h1 = h1 * 5 + 0x52dce729;
            k2 *= c2;
            k2 = Long.rotateLeft(k2, 33);
            k2 *= c1;
            h2 ^= k2;
            h2 = Long.rotateLeft(h2, 31);
            h2 += h1;
            h2 = h2 * 5 + 0x38495ab5;
        }

        long k1 = 0;
        long k2 = 0;

        // Tail: accumulate the 0-15 remaining bytes via deliberate fall-through.
        switch (len & 15) {
            case 15:
                k2 = (key[roundedEnd + 14] & 0xffL) << 48;
            case 14:
                k2 |= (key[roundedEnd + 13] & 0xffL) << 40;
            case 13:
                k2 |= (key[roundedEnd + 12] & 0xffL) << 32;
            case 12:
                k2 |= (key[roundedEnd + 11] & 0xffL) << 24;
            case 11:
                k2 |= (key[roundedEnd + 10] & 0xffL) << 16;
            case 10:
                k2 |= (key[roundedEnd + 9] & 0xffL) << 8;
            case 9:
                k2 |= (key[roundedEnd + 8] & 0xffL);
                k2 *= c2;
                k2 = Long.rotateLeft(k2, 33);
                k2 *= c1;
                h2 ^= k2;
            case 8:
                k1 = ((long) key[roundedEnd + 7]) << 56;
            case 7:
                k1 |= (key[roundedEnd + 6] & 0xffL) << 48;
            case 6:
                k1 |= (key[roundedEnd + 5] & 0xffL) << 40;
            case 5:
                k1 |= (key[roundedEnd + 4] & 0xffL) << 32;
            case 4:
                k1 |= (key[roundedEnd + 3] & 0xffL) << 24;
            case 3:
                k1 |= (key[roundedEnd + 2] & 0xffL) << 16;
            case 2:
                k1 |= (key[roundedEnd + 1] & 0xffL) << 8;
            case 1:
                k1 |= (key[roundedEnd] & 0xffL);
                k1 *= c1;
                k1 = Long.rotateLeft(k1, 31);
                k1 *= c2;
                h1 ^= k1;
        }

        // ----------
        // finalization

        h1 ^= len;
        h2 ^= len;

        h1 += h2;
        h2 += h1;

        h1 = fmix64(h1);
        h2 = fmix64(h2);

        h1 += h2;
        h2 += h1;

        out.val1 = h1;
        out.val2 = h2;
    }

    /** MurmurHash3 64-bit finalization mix: forces avalanching of all bits. */
    private static long fmix64(long k) {
        k ^= k >>> 33;
        k *= 0xff51afd7ed558ccdL;
        k ^= k >>> 33;
        k *= 0xc4ceb9fe1a85ec53L;
        k ^= k >>> 33;
        return k;
    }
}
builder.append(' '); + } + builder.append(d); + } + return builder.toString(); + } + + private static Double[] toDoubleArray(byte[] array) { + List doubles = toDoubles(array); + return doubles.toArray(new Double[doubles.size()]); + } + + public static List toDoubles(byte[] array) { + int blockSize = Double.SIZE / Byte.SIZE; + ByteBuffer wrap = ByteBuffer.wrap(array); + int capacity = array.length / blockSize; + List doubles = new ArrayList<>(capacity); + for (int i = 0; i < capacity; i++) { + double e = wrap.getDouble(i * blockSize); + doubles.add(e); + } + return doubles; + } + + private static Collection getTokens(Analyzer analyzer, String field, String sampleTextString) throws IOException { + Collection tokens = new LinkedList<>(); + TokenStream ts = analyzer.tokenStream(field, sampleTextString); + ts.reset(); + ts.addAttribute(CharTermAttribute.class); + while (ts.incrementToken()) { + CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class); + String token = new String(charTermAttribute.buffer(), 0, charTermAttribute.length()); + tokens.add(token); + } + ts.end(); + ts.close(); + return tokens; + } + + public static BooleanQuery getSimQuery(Analyzer analyzer, String fieldName, String text) throws IOException { + Collection tokens = getTokens(analyzer, fieldName, text); + BooleanQuery booleanQuery = new BooleanQuery(true); + for (String token : tokens) { + booleanQuery.add(new ConstantScoreQuery(new TermQuery(new Term(fieldName, token))), BooleanClause.Occur.SHOULD); + } + return booleanQuery; + } + + + public static byte[] toByteArray(List values) { + int blockSize = Double.SIZE / Byte.SIZE; + byte[] bytes = new byte[values.size() * blockSize]; + for (int i = 0, j = 0; i < values.size(); i++, j += blockSize) { + ByteBuffer.wrap(bytes, j, blockSize).putDouble(values.get(i)); + } + return bytes; + } + + public static byte[] toByteArray(String value) { + List doubles = new LinkedList<>(); + for (String dv : value.split(",")) { + 
doubles.add(Double.parseDouble(dv)); + } + return toByteArray(doubles); + } +} Property changes on: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/simsearch/SimSearchUtils.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Index: oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java =================================================================== --- oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java (revision 1833679) +++ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/writer/IndexWriterUtils.java (working copy) @@ -20,6 +20,7 @@ package org.apache.jackrabbit.oak.plugins.index.lucene.writer; import java.util.HashMap; +import java.util.List; import java.util.Map; @@ -26,6 +27,8 @@ import org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames; import org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition; import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants; +import org.apache.jackrabbit.oak.plugins.index.lucene.PropertyDefinition; +import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.BinaryMinHashAnalyzer; import org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; @@ -51,6 +54,15 @@ Analyzer definitionAnalyzer = definition.getAnalyzer(); Map analyzers = new HashMap(); analyzers.put(FieldNames.SPELLCHECK, new ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3)); + for (IndexDefinition.IndexingRule r : definition.getDefinedRules()) { + List similarityProperties = r.getSimilarityProperties(); + for (PropertyDefinition pd : similarityProperties) { + if (pd.useInSimilarity) { + analyzers.put(FieldNames.createSimilarityFieldName(pd.name), new 
BinaryMinHashAnalyzer()); + } + } + } + if (!definition.isSuggestAnalyzed()) { analyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer()); } Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java =================================================================== --- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (revision 1833679) +++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (working copy) @@ -19,66 +19,16 @@ package org.apache.jackrabbit.oak.plugins.index.lucene; -import javax.annotation.Nonnull; -import javax.jcr.PropertyType; - -import static com.google.common.collect.ImmutableSet.of; -import static com.google.common.collect.Lists.newArrayList; -import static java.util.Arrays.asList; -import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT; -import static org.apache.jackrabbit.JcrConstants.JCR_DATA; -import static org.apache.jackrabbit.JcrConstants.NT_FILE; -import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS; -import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS; -import static org.apache.jackrabbit.oak.api.Type.NAMES; -import static org.apache.jackrabbit.oak.api.Type.STRING; -import static org.apache.jackrabbit.oak.api.Type.STRINGS; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.ASYNC_PROPERTY_NAME; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.QUERY_PATHS; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME; -import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME; -import static org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.INDEX_DEFINITION_NODE; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ANALYZERS; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_ORIGINAL_TERM; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_ANALYZED; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE_SCOPE_INDEX; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA; -import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal; -import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.child; -import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator; -import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2; -import static org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection; -import static org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty; -import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT; -import static 
org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_EXCLUDED_PATHS; -import static org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_INCLUDED_PATHS; -import static org.hamcrest.CoreMatchers.containsString; -import static org.hamcrest.CoreMatchers.not; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertThat; -import static org.junit.Assert.assertTrue; - +import java.io.ByteArrayInputStream; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.URI; +import java.nio.charset.Charset; import java.text.ParseException; import java.util.Calendar; import java.util.Collections; -import java.util.Comparator; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -88,12 +38,14 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import javax.annotation.Nonnull; +import javax.jcr.PropertyType; + import com.google.common.base.Charsets; import com.google.common.collect.ComparisonChain; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.io.CountingInputStream; @@ -100,6 +52,7 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.jackrabbit.JcrConstants; +import org.apache.jackrabbit.oak.InitialContent; import org.apache.jackrabbit.oak.Oak; import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.CommitFailedException; @@ -120,6 +73,7 @@ import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider; import 
org.apache.jackrabbit.oak.plugins.index.lucene.directory.CopyOnReadDirectory; import org.apache.jackrabbit.oak.plugins.index.lucene.util.IndexDefinitionBuilder; +import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.SimSearchUtils; import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider; import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider; import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob; @@ -126,7 +80,6 @@ import org.apache.jackrabbit.oak.plugins.memory.MemoryNodeStore; import org.apache.jackrabbit.oak.plugins.memory.PropertyStates; import org.apache.jackrabbit.oak.plugins.nodetype.TypeEditorProvider; -import org.apache.jackrabbit.oak.InitialContent; import org.apache.jackrabbit.oak.plugins.nodetype.write.NodeTypeRegistry; import org.apache.jackrabbit.oak.query.AbstractQueryTest; import org.apache.jackrabbit.oak.spi.commit.CommitInfo; @@ -147,6 +100,56 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import static com.google.common.collect.ImmutableSet.of; +import static com.google.common.collect.Lists.newArrayList; +import static java.util.Arrays.asList; +import static org.apache.jackrabbit.JcrConstants.JCR_CONTENT; +import static org.apache.jackrabbit.JcrConstants.JCR_DATA; +import static org.apache.jackrabbit.JcrConstants.NT_FILE; +import static org.apache.jackrabbit.oak.InitialContentHelper.INITIAL_CONTENT; +import static org.apache.jackrabbit.oak.api.QueryEngine.NO_BINDINGS; +import static org.apache.jackrabbit.oak.api.QueryEngine.NO_MAPPINGS; +import static org.apache.jackrabbit.oak.api.Type.NAMES; +import static org.apache.jackrabbit.oak.api.Type.STRING; +import static org.apache.jackrabbit.oak.api.Type.STRINGS; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.ASYNC_PROPERTY_NAME; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.DECLARING_NODE_TYPES; +import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NAME; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.QUERY_PATHS; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME; +import static org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition.INDEX_DEFINITION_NODE; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ANALYZERS; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INCLUDE_PROPERTY_NAMES; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_ORIGINAL_TERM; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.ORDERED_PROP_NAMES; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROPDEF_PROP_NODE_NAME; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_ANALYZED; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NAME; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_NODE_SCOPE_INDEX; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_PROPERTY_INDEX; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PROP_TYPE; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.TIKA; +import static org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorTest.createCal; +import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.child; +import static 
org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.newNodeAggregator; +import static org.apache.jackrabbit.oak.plugins.index.lucene.TestUtil.useV2; +import static org.apache.jackrabbit.oak.plugins.index.property.OrderedIndex.OrderDirection; +import static org.apache.jackrabbit.oak.plugins.memory.PropertyStates.createProperty; +import static org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_EXCLUDED_PATHS; +import static org.apache.jackrabbit.oak.spi.filter.PathFilter.PROP_INCLUDED_PATHS; +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.not; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; + @SuppressWarnings("ArraysAsListWithZeroOrOneArgument") public class LucenePropertyIndexTest extends AbstractQueryTest { /** @@ -2946,6 +2949,71 @@ "lucene:test1(/oak:index/test1)", asList("/d")); } + @Test + public void testRepSimilarWithBinaries() throws Exception { + IndexDefinitionBuilder idxb = new IndexDefinitionBuilder().noAsync(); + idxb.indexRule("nt:base").property("fv").useInSimilarity().nodeScopeIndex().propertyIndex(); + + Tree idx = root.getTree("/").getChild("oak:index").addChild("test1"); + idxb.build(idx); + root.commit(); + + String query = "select [jcr:path] from [nt:base] where similar(., '/test/i')"; + Tree test = root.getTree("/").addChild("test"); + + URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI(); + File file = new File(uri); + + for (String line : IOUtils.readLines(new FileInputStream(file), Charset.defaultCharset())) { + String[] split = line.split(","); + List values = new LinkedList<>(); + int i = 0; + for (String s : split) { + if (i > 0) { + values.add(Double.parseDouble(s)); + } + i++; + } + 
+ byte[] bytes = SimSearchUtils.toByteArray(values); + List actual = SimSearchUtils.toDoubles(bytes); + assertEquals(values, actual); + + Blob blob = root.createBlob(new ByteArrayInputStream(bytes)); + String name = split[0]; + test.addChild(name).setProperty("fv", blob, Type.BINARY); + } + root.commit(); + + Iterator result = executeQuery(query, "JCR-SQL2").iterator(); + + while (result.hasNext()) { + String next = result.next(); + System.err.println(next); + } + +// assertTrue(result.hasNext()); +// assertEquals("/test/i", result.next()); +// assertTrue(result.hasNext()); +// assertEquals("/test/e", result.next()); + + +// assertTrue(result.hasNext()); +// assertTrue(result.hasNext()); +// assertEquals("/test/d", result.next()); +// assertTrue(result.hasNext()); +// assertEquals("/test/e", result.next()); +// assertTrue(result.hasNext()); +// assertEquals("/test/f", result.next()); +// assertTrue(result.hasNext()); +// assertEquals("/test/g", result.next()); +// assertTrue(result.hasNext()); +// assertEquals("/test/h", result.next()); +// assertTrue(result.hasNext()); +// assertEquals("/test/i", result.next()); + assertFalse(result.hasNext()); + } + private void assertPlanAndQuery(String query, String planExpectation, List paths) { assertPlanAndQuery(query, planExpectation, paths, false); } Index: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/SimSearchAnalyzerTest.java =================================================================== --- oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/SimSearchAnalyzerTest.java (nonexistent) +++ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/SimSearchAnalyzerTest.java (working copy) @@ -0,0 +1,115 @@ +package org.apache.jackrabbit.oak.plugins.index.lucene.util; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.BinaryMinHashAnalyzer; 
+import org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.SimSearchUtils; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import org.junit.Test; + +import static org.apache.jackrabbit.oak.plugins.index.lucene.util.simsearch.SimSearchUtils.getSimQuery; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; + +/** + * + */ +public class SimSearchAnalyzerTest { + + @Test + public void testTextHandling() throws Exception { + BinaryMinHashAnalyzer analyzer = new BinaryMinHashAnalyzer(); + String text = "the big brown fox jump over the lazy dog"; + TokenStream stream = analyzer.tokenStream("", text); + stream.reset(); + stream.addAttribute(CharTermAttribute.class); + int i = 0; + Collection outputs = new LinkedList<>(); + while (stream.incrementToken()) { + CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class); + String token = new String(termAttribute.buffer(), 0, termAttribute.length()); + assertNotNull(token); + outputs.add(token); + i++; + } + stream.close(); + assertFalse(outputs.isEmpty()); + + Directory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer)); + + Document document = new Document(); + String fieldName = "text"; + document.add(new TextField(fieldName, text, 
Field.Store.YES)); + writer.addDocument(document); + writer.commit(); + + DirectoryReader reader = DirectoryReader.open(writer, false); + IndexSearcher searcher = new IndexSearcher(reader); + Query booleanQuery = getSimQuery(analyzer, fieldName, text); + TopDocs topDocs = searcher.search(booleanQuery, 1); + assertEquals(1, topDocs.totalHits); + } + + @Test + public void testBinaryHandling() throws Exception { + BinaryMinHashAnalyzer analyzer = new BinaryMinHashAnalyzer(); + Directory directory = new RAMDirectory(); + IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer)); + DirectoryReader reader = null; + try { + + List values = new LinkedList<>(); + values.add(0.1d); + values.add(0.3d); + values.add(0.5d); + values.add(0.7d); + values.add(0.9d); + values.add(0.11d); + values.add(0.13d); + values.add(0.17d); + values.add(0.19d); + values.add(0.23d); + values.add(0.29d); + + byte[] bytes = SimSearchUtils.toByteArray(values); + String fvString = SimSearchUtils.toDoubleString(bytes); + + String fieldName = "fvs"; + Document document = new Document(); + document.add(new TextField(fieldName, fvString, Field.Store.NO)); + writer.addDocument(document); + writer.commit(); + + + reader = DirectoryReader.open(writer, false); + IndexSearcher searcher = new IndexSearcher(reader); + Query booleanQuery = getSimQuery(analyzer, fieldName, fvString); + TopDocs topDocs = searcher.search(booleanQuery, 1); + assertEquals(1, topDocs.totalHits); + } finally { + if (reader != null) { + reader.close(); + } + writer.close(); + directory.close(); + } + } + +} \ No newline at end of file Property changes on: oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/SimSearchAnalyzerTest.java ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property