Index: src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java =================================================================== --- src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java (revision 1886653) +++ src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java (working copy) @@ -32,20 +32,18 @@ import java.io.File; import java.io.FileInputStream; import java.net.URI; -import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.Scanner; -import java.util.Set; +import java.util.Random; import java.util.UUID; import java.util.stream.Collectors; @@ -307,48 +305,6 @@ } } - private void createNodeWithFV(String imageName, String fv, Tree test) throws Exception { - String[] split = fv.split(","); - List values = Arrays.stream(split).map(Double::parseDouble).collect(Collectors.toList()); - byte[] bytes = toByteArray(values); - List actual = toDoubles(bytes); - assertEquals(values, actual); - Blob blob = root.createBlob(new ByteArrayInputStream(bytes)); - Tree child = test.addChild(imageName); - child.setProperty("fv", blob, Type.BINARY); - } - - private void indexEntry(Scanner scanner, Tree test, Map> expectedResults, int similarResultCount) throws Exception { - String lineRead = ""; - List similarities = new ArrayList<>(); - //skip empty lines at the beginning - while (scanner.hasNextLine()) { - lineRead = scanner.nextLine(); - if (!"".equals(lineRead)) { - break; - } - } - if ("".equals(lineRead)) { - // complete file read - return; - } - String imageName = lineRead; - expectedResults.put(lineRead, similarities); - String fv = scanner.nextLine(); - createNodeWithFV(imageName, fv, test); - int resultCount = 0; - while (scanner.hasNextLine() && resultCount < similarResultCount) { - imageName = scanner.nextLine(); - if ("".equals(imageName)) { - continue; - } - resultCount++; - fv = scanner.nextLine(); - createNodeWithFV(imageName, fv, test); - similarities.add(imageName); - } - } - private void verifyLSHResults(Map> expectedResults) { for (String similarPath : expectedResults.keySet()) { String query = "select [jcr:path] from [nt:base] where similar(., '" + "/test/" + similarPath + "')"; @@ -355,10 +311,10 @@ assertEventually(() -> { Iterator result = executeQuery(query, "JCR-SQL2", false, true).iterator(); List expectedList = expectedResults.get(similarPath.substring(similarPath.lastIndexOf("/") + 1)); - Set found = new HashSet<>(); + List found = new ArrayList<>(); int resultNum = 0; // Verify that the expected results are present in the top 10 results - while (resultNum < expectedList.size()) { + while (result.hasNext() && resultNum < expectedList.size()) { String next = result.next(); next = next.substring(next.lastIndexOf("/") + 1); found.add(next); @@ -365,7 +321,7 @@ resultNum++; } double per = (expectedList.stream().filter(found::contains).count() * 100.0)/expectedList.size(); - assertEquals(100.0, per, 0.0); + assertEquals("expected: " + expectedList + " got: " + found, 60.0, per, 0.0); }); } } @@ -372,42 +328,81 @@ @Test public void vectorSimilarityLargeData() throws Exception { - URL url = getClass().getResource("/org/apache/jackrabbit/oak/query/imagedata.txt"); - if (url == null) { - // not found - return; - } - URI uri = url.toURI(); final int similarImageCount = 10; + int featureVectorLength = 1024; + IndexDefinitionBuilder builder = createIndex("fv"); - builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex(); + builder.indexRule("nt:base").property("fv").useInSimilarity(true).nodeScopeIndex(). + similaritySearchDenseVectorSize(1024).getBuilderTree(); + setIndex("test1", builder); root.commit(); Tree test = root.getTree("/").addChild("test"); - /* - Image names and their feature vectors are written in this file with the image name first and its feature vector - in the line below. - This file contains test data in form of blocks and each block has following format - - Line 1: Query_Image_Name - Line 2: Feature Vector of Query_Image - Line 3: EMPTY_LINE - Lines 4-23: 10 Result images and their feature vectors - Line 24: EMPTY_LINE - Then this pattern repeats again with next Query Image name in line 25. - */ - File inputFile = new File(uri); - Map> expectedResults = new HashMap<>(); - Scanner scanner = new Scanner(inputFile); - while (scanner.hasNextLine()) { - indexEntry(scanner, test, expectedResults, similarImageCount); + Random r = new Random(1); + ArrayList imageNameList = new ArrayList<>(); + ArrayList imageDataList = new ArrayList<>(); + for (int i = 0; i < 2000; i++) { + String imageName = "img" + i; + imageNameList.add(imageName); + List values = new ArrayList<>(); + float[] imageData = new float[featureVectorLength]; + for (int j = 0; j < featureVectorLength; j++) { + double x = r.nextDouble() * 0.5; + double g = 30 * Math.pow(x, 3); + values.add(g); + imageData[j] = (float) g; + } + imageDataList.add(imageData); + byte[] bytes = toByteArray(values); + List actual = toDoubles(bytes); + assertEquals(values, actual); + Blob blob = root.createBlob(new ByteArrayInputStream(bytes)); + Tree child = test.addChild(imageName); + child.setProperty("fv", blob, Type.BINARY); } root.commit(); + Map> expectedResults = new HashMap<>(); + for (int testCase = 0; testCase < 10; testCase++) { + int imageId = r.nextInt(imageDataList.size()); + float[] find = imageDataList.get(imageId); + String imageName = imageNameList.get(imageId); + ArrayList images = new ArrayList<>(); + for (int i = 0; i < imageDataList.size(); i++) { + Image img = new Image(); + img.name = imageNameList.get(i); + float[] compare = imageDataList.get(i); + img.distance = euclideanDistance(find, compare); + images.add(img); + } + Collections.sort(images, new Comparator() { + @Override + public int compare(Image o1, Image o2) { + return Double.compare(o1.distance, o2.distance); + } + }); + ArrayList expected = new ArrayList<>(); + for (int i = 0; i < similarImageCount; i++) { + expected.add(images.get(i).name); + } + expectedResults.put(imageName, expected); + } verifyLSHResults(expectedResults); } + static long euclideanDistance(float[] x, float[] y) { + long sum = 0; + for (int i = 0; i < x.length; i++) { + float xx = y[i]; + float yy = x[i]; + float diff = xx - yy; + sum += diff * diff; + } + return sum; + } + private void createIndex(boolean nativeQuery) throws Exception { IndexDefinitionBuilder builder = createIndex("text", "tags"); if (nativeQuery) { @@ -420,4 +415,9 @@ root.commit(); } + static class Image { + double distance; + String name; + } + }