From 1c4a14f8fde3526775caecd15befe11b82fce6d9 Mon Sep 17 00:00:00 2001
From: Vikas Saurabh <vsaurabh@adobe.com>
Date: Fri, 30 Nov 2018 15:55:12 +0530
Subject: [PATCH 1/2] OAK-7930: Add tape sampling

---
 .../index/lucene/util/TapeSampling.java       |  82 ++++++++++++
 .../index/lucene/util/TapeSamplingTest.java   | 119 ++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
 create mode 100644 oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java

diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
new file mode 100644
index 0000000000..df98ee1679
--- /dev/null
+++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.AbstractIterator;
+
+import java.util.Iterator;
+import java.util.Random;
+
+/**
+ * Picks 'k' random samples from a streaming input of 'N' items such that every item has
+ * a 'k/N' probability of being picked. {@code N} must be known before sampling starts.
+ *
+ * Samples are picked without replacement and handed out right away, hence O(1) extra memory.
+ *
+ * Implementation inspired from "JONES,T.G. A note on sampling a tape file"
+ * (https://dl.acm.org/citation.cfm?id=368159)
+ */
+public class TapeSampling<T> {
+    private final Random rGen;
+    private final Iterator<T> input;
+    private final int N;
+    private final int k;
+
+    /**
+     * @param rGen  source of the random numbers that decide which items are picked
+     * @param input items to sample from; declared to hold {@code N} items
+     * @param N     declared number of items in {@code input}
+     * @param k     number of items to sample
+     */
+    public TapeSampling(final Random rGen, final Iterator<T> input, final int N, final int k) {
+        this.rGen = rGen;
+        this.input = input;
+        this.N = N;
+        this.k = k;
+    }
+
+    /**
+     * @return lazy iterator over the samples, in input order; advancing it consumes {@code input}
+     * @throws IllegalArgumentException while iterating, if {@code input} runs out before 'k' items got sampled
+     */
+    public Iterator<T> getSamples() {
+        return new AbstractIterator<T>() {
+            int sampled = 0;
+            int seen = 0;
+            @Override
+            protected T computeNext() {
+                while (sampled != k) {
+                    // %s template: the message only gets formatted when the check fails, not once per item
+                    Preconditions.checkArgument(input.hasNext(),
+                            "Not enough input items provided. Declared: %s; got %s; sampled: %s", N, seen, sampled);
+                    T item = input.next();
+                    // 'r' is uniform in [1, N - seen], so the item is picked with probability (k - sampled) / (N - seen)
+                    int r = rGen.nextInt(N - seen) + 1;
+                    seen++;
+                    if (r <= k - sampled) {
+                        sampled++;
+                        return item;
+                    }
+                }
+                return endOfData();
+            }
+        };
+    }
+}
diff --git a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
new file mode 100644
index 0000000000..84523a6ae5
--- /dev/null
+++ b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Iterators;
+import org.junit.Test;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import static com.google.common.collect.Lists.newArrayList;
+import static org.junit.Assert.assertEquals;
+
+public class TapeSamplingTest {
+    /** With the generator pinned to its highest value, only the last 'k' items get picked. */
+    @Test
+    public void testWithHighestRandom() {
+        final Random alwaysHighest = new Random() {
+            @Override
+            public int nextInt(int bound) {
+                return bound - 1;
+            }
+        };
+
+        final int first = 10;
+        final int last = 30;
+        final int k = 10;
+
+        List<Integer> items = range(first, last);
+        List<Integer> picked = sample(alwaysHighest, items, k);
+
+        assertEquals(range(last - k + 1, last), picked);
+    }
+
+    /** With the generator pinned to its lowest value, the first 'k' items get picked. */
+    @Test
+    public void testWithLowestRandom() {
+        final Random alwaysLowest = new Random() {
+            @Override
+            public int nextInt(int bound) {
+                return 0;
+            }
+        };
+
+        final int first = 10;
+        final int last = 30;
+        final int k = 10;
+
+        List<Integer> items = range(first, last);
+        List<Integer> picked = sample(alwaysLowest, items, k);
+
+        assertEquals(range(first, first + k - 1), picked);
+    }
+
+    /** Asking for as many samples as there are items must return all of them, in order. */
+    @Test
+    public void allItemsWhenKisN() {
+        final int k = 10;
+        List<Integer> items = range(11, 20);
+
+        List<Integer> picked = sample(new Random(), items, k);
+
+        assertEquals(items, picked);
+    }
+
+    /** Whatever the random choices are, the sampler must hand out exactly 'k' items. */
+    @Test
+    public void sampleExactlyK() {
+        final int k = 10;
+        List<Integer> items = range(11, 1000);
+
+        Iterator<Integer> picked = sampler(new Random(), items, k).getSamples();
+
+        assertEquals("Must sample exactly " + k + " items", k, Iterators.size(picked));
+    }
+
+    /** Creates a sampler over {@code items}, declaring the list size as the item count. */
+    private static TapeSampling<Integer> sampler(Random rnd, List<Integer> items, int k) {
+        return new TapeSampling<>(rnd, items.iterator(), items.size(), k);
+    }
+
+    /** Runs a sampler over {@code items} and collects everything it picks. */
+    private static List<Integer> sample(Random rnd, List<Integer> items, int k) {
+        return newArrayList(sampler(rnd, items, k).getSamples());
+    }
+
+    /** Lists the integers from {@code start} to {@code end}, both inclusive. */
+    private List<Integer> range(final int start, final int end) {
+        return newArrayList(new AbstractIterator<Integer>() {
+            int next = start;
+
+            @Override
+            protected Integer computeNext() {
+                if (next <= end) {
+                    return next++;
+                }
+                return endOfData();
+            }
+        });
+    }
+}
-- 
2.18.0

