diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java index 9d63374a12..363a51b5b9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/StoreScanner.java @@ -552,7 +552,6 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner prevCell = cell; topChanged = false; ScanQueryMatcher.MatchCode qcode = matcher.match(cell); - qcode = optimize(qcode, cell); switch(qcode) { case INCLUDE: case INCLUDE_AND_SEEK_NEXT_ROW: @@ -575,7 +574,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner // the heap.peek() will any way be in the next row. So the SQM.match(cell) need do // another compareRow to say the current row is DONE matcher.row = null; - seekToNextRow(cell); + seekOrSkipToNextRow(cell); break LOOP; } @@ -606,9 +605,9 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner // the heap.peek() will any way be in the next row. So the SQM.match(cell) need do // another compareRow to say the current row is DONE matcher.row = null; - seekToNextRow(cell); + seekOrSkipToNextRow(cell); } else if (qcode == ScanQueryMatcher.MatchCode.INCLUDE_AND_SEEK_NEXT_COL) { - seekAsDirection(matcher.getKeyForNextColumn(cell)); + seekOrSkipToNextColumn(cell); } else { this.heap.next(); } @@ -648,7 +647,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner // the heap.peek() will any way be in the next row. So the SQM.match(cell) need do // another compareRow to say the current row is DONE matcher.row = null; - seekToNextRow(cell); + seekOrSkipToNextRow(cell); NextState stateAfterSeekNextRow = needToReturn(outResult); if (stateAfterSeekNextRow != null) { return scannerContext.setScannerState(stateAfterSeekNextRow).hasMoreValues(); @@ -656,7 +655,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner break; case SEEK_NEXT_COL: - seekAsDirection(matcher.getKeyForNextColumn(cell)); + seekOrSkipToNextColumn(cell); NextState stateAfterSeekNextColumn = needToReturn(outResult); if (stateAfterSeekNextColumn != null) { return scannerContext.setScannerState(stateAfterSeekNextColumn).hasMoreValues(); @@ -712,94 +711,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner } return null; } - - /** - * See if we should actually SEEK or rather just SKIP to the next Cell (see HBASE-13109). - * This method works together with ColumnTrackers and Filters. ColumnTrackers may issue SEEK - * hints, such as seek to next column, next row, or seek to an arbitrary seek key. - * This method intercepts these qcodes and decides whether a seek is the most efficient _actual_ - * way to get us to the requested cell (SEEKs are more expensive than SKIP, SKIP, SKIP inside the - * current, loaded block). - * It does this by looking at the next indexed key of the current HFile. This key - * is then compared with the _SEEK_ key, where a SEEK key is an artificial 'last possible key - * on the row' (only in here, we avoid actually creating a SEEK key; in the compare we work with - * the current Cell but compare as though it were a seek key; see down in - * matcher.compareKeyForNextRow, etc). If the compare gets us onto the - * next block we *_SEEK, otherwise we just INCLUDE or SKIP, and let the ColumnTrackers or Filters - * go through the next Cell, and so on) - * - *

The ColumnTrackers and Filters must behave correctly in all cases, i.e. if they are past the - * Cells they care about they must issues a SKIP or SEEK. - * - *

Other notes: - *

- *

A good proxy (best effort) to determine whether INCLUDE/SKIP is better than SEEK is whether - * we'll likely end up seeking to the next block (or past the next block) to get our next column. - * Example: - *

-   * |    BLOCK 1              |     BLOCK 2                   |
-   * |  r1/c1, r1/c2, r1/c3    |    r1/c4, r1/c5, r2/c1        |
-   *                                   ^         ^
-   *                                   |         |
-   *                           Next Index Key   SEEK_NEXT_ROW (before r2/c1)
-   *
-   *
-   * |    BLOCK 1                       |     BLOCK 2                      |
-   * |  r1/c1/t5, r1/c1/t4, r1/c1/t3    |    r1/c1/t2, r1/c1/T1, r1/c2/T3  |
-   *                                            ^              ^
-   *                                            |              |
-   *                                    Next Index Key        SEEK_NEXT_COL
-   * 
- * Now imagine we want columns c1 and c3 (see first diagram above), the 'Next Index Key' of r1/c4 - * is > r1/c3 so we should seek to get to the c1 on the next row, r2. In second case, say we only - * want one version of c1, after we have it, a SEEK_COL will be issued to get to c2. Looking at - * the 'Next Index Key', it would land us in the next block, so we should SEEK. In other scenarios - * where the SEEK will not land us in the next block, it is very likely better to issues a series - * of SKIPs. - */ - @VisibleForTesting - protected ScanQueryMatcher.MatchCode optimize(ScanQueryMatcher.MatchCode qcode, Cell cell) { - switch(qcode) { - case INCLUDE_AND_SEEK_NEXT_COL: - case SEEK_NEXT_COL: - { - Cell nextIndexedKey = getNextIndexedKey(); - if (nextIndexedKey != null && nextIndexedKey != KeyValueScanner.NO_NEXT_INDEXED_KEY - && matcher.compareKeyForNextColumn(nextIndexedKey, cell) >= 0) { - return qcode == MatchCode.SEEK_NEXT_COL ? MatchCode.SKIP : MatchCode.INCLUDE; - } - break; - } - case INCLUDE_AND_SEEK_NEXT_ROW: - case SEEK_NEXT_ROW: - { - // If it is a Get Scan, then we know that we are done with this row; there are no more - // rows beyond the current one: don't try to optimize. We are DONE. Return the *_NEXT_ROW - // qcode as is. When the caller gets these flags on a Get Scan, it knows it can shut down the - // Scan. - if (!this.scan.isGetScan()) { - Cell nextIndexedKey = getNextIndexedKey(); - if (nextIndexedKey != null && nextIndexedKey != KeyValueScanner.NO_NEXT_INDEXED_KEY - && matcher.compareKeyForNextRow(nextIndexedKey, cell) > 0) { - return qcode == MatchCode.SEEK_NEXT_ROW ? MatchCode.SKIP : MatchCode.INCLUDE; - } - } - break; - } - default: - break; - } - return qcode; - } - + @Override public long getReadPoint() { return readPt; @@ -836,7 +748,7 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner // these scanners are properly closed() whether or not the scan is completed successfully // Eagerly creating scanners so that we have the ref counting ticking on the newly created // store files. In case of stream scanners this eager creation does not induce performance - // penalty because in scans (that uses stream scanners) the next() call is bound to happen. + // penalty because in scans (that uses stream scanners) the next() call is bound to happen. List scanners = store.getScanners(sfs, cacheBlocks, get, usePread, isCompaction, matcher, scan.getStartRow(), scan.getStopRow(), this.readPt, false); flushedstoreFileScanners.addAll(scanners); @@ -872,6 +784,130 @@ public class StoreScanner extends NonReversedNonLazyKeyValueScanner // Let the next() call handle re-creating and seeking } + private void seekOrSkipToNextRow(Cell cell) throws IOException { + // If it is a Get Scan, then we know that we are done with this row; there are no more + // rows beyond the current one: don't try to optimize. + if (!get) { + if (trySkipToNextRow(cell)) { + return; + } + } + seekToNextRow(cell); + } + + private void seekOrSkipToNextColumn(Cell cell) throws IOException { + if (!trySkipToNextColumn(cell)) { + seekAsDirection(matcher.getKeyForNextColumn(cell)); + } + } + + /** + * See if we should actually SEEK or rather just SKIP to the next Cell (see HBASE-13109). + * ScanQueryMatcher may issue SEEK hints, such as seek to next column, next row, + * or seek to an arbitrary seek key. This method decides whether a seek is the most efficient + * _actual_ way to get us to the requested cell (SEEKs are more expensive than SKIP, SKIP, + * SKIP inside the current, loaded block). + * It does this by looking at the next indexed key of the current HFile. This key + * is then compared with the _SEEK_ key, where a SEEK key is an artificial 'last possible key + * on the row' (only in here, we avoid actually creating a SEEK key; in the compare we work with + * the current Cell but compare as though it were a seek key; see down in + * matcher.compareKeyForNextRow, etc). If the compare gets us onto the + * next block we *_SEEK, otherwise we just SKIP to the next requested cell. + * + *

Other notes: + *

+ *

A good proxy (best effort) to determine whether SKIP is better than SEEK is whether + * we'll likely end up seeking to the next block (or past the next block) to get our next column. + * Example: + *

+   * |    BLOCK 1              |     BLOCK 2                   |
+   * |  r1/c1, r1/c2, r1/c3    |    r1/c4, r1/c5, r2/c1        |
+   *                                   ^         ^
+   *                                   |         |
+   *                           Next Index Key   SEEK_NEXT_ROW (before r2/c1)
+   *
+   *
+   * |    BLOCK 1                       |     BLOCK 2                      |
+   * |  r1/c1/t5, r1/c1/t4, r1/c1/t3    |    r1/c1/t2, r1/c1/T1, r1/c2/T3  |
+   *                                            ^              ^
+   *                                            |              |
+   *                                    Next Index Key        SEEK_NEXT_COL
+   * 
+ * Now imagine we want columns c1 and c3 (see first diagram above), the 'Next Index Key' of r1/c4 + * is > r1/c3 so we should seek to get to the c1 on the next row, r2. In second case, say we only + * want one version of c1, after we have it, a SEEK_COL will be issued to get to c2. Looking at + * the 'Next Index Key', it would land us in the next block, so we should SEEK. In other scenarios + * where the SEEK will not land us in the next block, it is very likely better to issues a series + * of SKIPs. + * @param cell current cell + * @return true means skip to next row, false means not + */ + @VisibleForTesting + protected boolean trySkipToNextRow(Cell cell) throws IOException { + Cell nextCell = null; + // used to guard against a changed next indexed key by doing a identity comparison + // when the identity changes we need to compare the bytes again + Cell previousIndexedKey = null; + do { + Cell nextIndexedKey = getNextIndexedKey(); + if (nextIndexedKey != null && nextIndexedKey != KeyValueScanner.NO_NEXT_INDEXED_KEY && + (nextIndexedKey == previousIndexedKey || + matcher.compareKeyForNextRow(nextIndexedKey, cell) >= 0)) { + this.heap.next(); + ++kvsScanned; + previousIndexedKey = nextIndexedKey; + } else { + return false; + } + } while ((nextCell = this.heap.peek()) != null && CellUtil.matchingRow(cell, nextCell)); + return true; + } + + /** + * See {@link org.apache.hadoop.hbase.regionserver.StoreScanner#trySkipToNextRow(Cell)} + * @param cell current cell + * @return true means skip to next column, false means not + */ + + /** + * @param cell current cell + * @return true means skip to next column, false means not + */ + @VisibleForTesting + protected boolean trySkipToNextColumn(Cell cell) throws IOException { + Cell nextCell = null; + // used to guard against a changed next indexed key by doing a identity comparison + // when the identity changes we need to compare the bytes again + Cell previousIndexedKey = null; + do { + Cell nextIndexedKey = getNextIndexedKey(); + if (nextIndexedKey != null && nextIndexedKey != KeyValueScanner.NO_NEXT_INDEXED_KEY && + (nextIndexedKey == previousIndexedKey || + matcher.compareKeyForNextColumn(nextIndexedKey, cell) >= 0)) { + this.heap.next(); + ++kvsScanned; + previousIndexedKey = nextIndexedKey; + } else { + return false; + } + } while ((nextCell = this.heap.peek()) != null && CellUtil.matchingColumn(cell, nextCell)); + // We need this check because it may happen that the new scanner that we get + // during heap.next() is requiring reseek due of fake KV previously generated for + // ROWCOL bloom filter optimization. See HBASE-19863 for more details + if (nextCell != null && matcher.compareKeyForNextColumn(nextCell, cell) < 0) { + return false; + } + return true; + } + /** * @param flushed indicates if there was a flush * @return true if top of heap has changed (and KeyValueHeap has to try the diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java index 0d623ce018..824e68dce7 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java @@ -1571,6 +1571,35 @@ public class HBaseTestingUtility extends HBaseCommonTestingUtility { return new HTable(c, desc.getTableName()); } + /** + * Create a table. + * @param htd table descriptor + * @param families array of column families + * @param splitKeys array of split keys + * @param type Bloom type + * @param blockSize block size + * @param c Configuration to use + * @return An HTable instance for the created table. + * @throws IOException if getAdmin or createTable fails + */ + public HTable createTable(HTableDescriptor htd, byte[][] families, byte[][] splitKeys, + BloomType type, int blockSize, Configuration c) throws IOException { + for (byte[] family : families) { + HColumnDescriptor hcd = new HColumnDescriptor(family); + // Disable blooms (they are on by default as of 0.95) but we disable them here because + // tests have hard coded counts of what to expect in block cache, etc., and blooms being + // on is interfering. + hcd.setBloomFilterType(type); + hcd.setBlocksize(blockSize); + htd.addFamily(hcd); + } + getHBaseAdmin().createTable(htd, splitKeys); + // HBaseAdmin only waits for regions to appear in hbase:meta we should wait until they are + // assigned + waitUntilAllRegionsAssigned(htd.getTableName()); + return (HTable) getConnection().getTable(htd.getTableName()); + } + /** * Create a table. * @param tableName diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestIsDeleteFailure.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestIsDeleteFailure.java new file mode 100644 index 0000000000..771d1557fe --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestIsDeleteFailure.java @@ -0,0 +1,154 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.util.ArrayList; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Delete; +import org.apache.hadoop.hbase.client.Mutation; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.filter.BinaryComparator; +import org.apache.hadoop.hbase.filter.CompareFilter; +import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; +import org.apache.hadoop.hbase.testclassification.FilterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.testclassification.RegionServerTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; + +/** + * Test failure in ScanDeleteTracker.isDeleted when ROWCOL bloom filter + * is used during a scan with a filter. + */ +@Category({ RegionServerTests.class, FilterTests.class, MediumTests.class }) +public class TestIsDeleteFailure { + private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + + @Rule public TestName name = new TestName(); + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + TEST_UTIL.getConfiguration().setInt("hbase.regionserver.msginterval", 100); + TEST_UTIL.getConfiguration().setInt("hbase.client.pause", 250); + TEST_UTIL.getConfiguration().setInt("hbase.client.retries.number", 2); + TEST_UTIL.getConfiguration().setBoolean("hbase.master.enabletable.roundrobin", true); + TEST_UTIL.startMiniCluster(1); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + TEST_UTIL.shutdownMiniCluster(); + } + + @Test + public void testIsDeleteFailure() throws Exception { + final HTableDescriptor table = new HTableDescriptor(TableName.valueOf(name.getMethodName())); + final byte[] family = Bytes.toBytes("0"); + final byte[] c1 = Bytes.toBytes("C01"); + final byte[] c2 = Bytes.toBytes("C02"); + final byte[] c3 = Bytes.toBytes("C03"); + final byte[] c4 = Bytes.toBytes("C04"); + final byte[] c5 = Bytes.toBytes("C05"); + final byte[] c6 = Bytes.toBytes("C07"); + final byte[] c7 = Bytes.toBytes("C07"); + final byte[] c8 = Bytes.toBytes("C08"); + final byte[] c9 = Bytes.toBytes("C09"); + final byte[] c10 = Bytes.toBytes("C10"); + final byte[] c11 = Bytes.toBytes("C11"); + final byte[] c12 = Bytes.toBytes("C12"); + final byte[] c13 = Bytes.toBytes("C13"); + final byte[] c14 = Bytes.toBytes("C14"); + final byte[] c15 = Bytes.toBytes("C15"); + + final byte[] val = Bytes.toBytes("foo"); + List fams = new ArrayList<>(1); + fams.add(family); + Table ht = TEST_UTIL + .createTable(table, fams.toArray(new byte[0][]), null, BloomType.ROWCOL, 10000, + new Configuration(TEST_UTIL.getConfiguration())); + List pending = new ArrayList(); + for (int i = 0; i < 1000; i++) { + byte[] row = Bytes.toBytes("key" + Integer.toString(i)); + Put put = new Put(row); + put.addColumn(family, c3, val); + put.addColumn(family, c4, val); + put.addColumn(family, c5, val); + put.addColumn(family, c6, val); + put.addColumn(family, c7, val); + put.addColumn(family, c8, val); + put.addColumn(family, c12, val); + put.addColumn(family, c13, val); + put.addColumn(family, c15, val); + pending.add(put); + Delete del = new Delete(row); + del.addColumns(family, c2); + del.addColumns(family, c9); + del.addColumns(family, c10); + del.addColumns(family, c14); + pending.add(del); + } + ht.batch(pending, new Object[pending.size()]); + TEST_UTIL.flush(); + TEST_UTIL.compact(true); + for (int i = 20; i < 300; i++) { + byte[] row = Bytes.toBytes("key" + Integer.toString(i)); + Put put = new Put(row); + put.addColumn(family, c3, val); + put.addColumn(family, c4, val); + put.addColumn(family, c5, val); + put.addColumn(family, c6, val); + put.addColumn(family, c7, val); + put.addColumn(family, c8, val); + put.addColumn(family, c12, val); + put.addColumn(family, c13, val); + put.addColumn(family, c15, val); + pending.add(put); + Delete del = new Delete(row); + del.addColumns(family, c2); + del.addColumns(family, c9); + del.addColumns(family, c10); + del.addColumns(family, c14); + pending.add(del); + } + ht.batch(pending, new Object[pending.size()]); + TEST_UTIL.flush(); + + Scan scan = new Scan(); + scan.addColumn(family, c9); + scan.addColumn(family, c15); + SingleColumnValueFilter filter = + new SingleColumnValueFilter(family, c15, CompareFilter.CompareOp.EQUAL, + new BinaryComparator(c15)); + scan.setFilter(filter); + //Trigger the scan for not existing row, so it will scan over all rows + for (Result result : ht.getScanner(scan)) { + result.advance(); + } + ht.close(); + } +} \ No newline at end of file