Index: src/test/org/apache/hadoop/hbase/TestCompare.java =================================================================== --- src/test/org/apache/hadoop/hbase/TestCompare.java (revision 680917) +++ src/test/org/apache/hadoop/hbase/TestCompare.java (working copy) @@ -42,6 +42,15 @@ assertTrue(past.compareTo(now) > 0); assertTrue(now.compareTo(now) == 0); assertTrue(future.compareTo(now) < 0); + // Check that empty column comes before one with a column + HStoreKey nocolumn = new HStoreKey(a, timestamp); + HStoreKey withcolumn = new HStoreKey(a, a, timestamp); + assertTrue(nocolumn.compareTo(withcolumn) < 0); + // Check that empty column with LATEST timestamp comes before one with a + // column and an old timestamp. + nocolumn = new HStoreKey(a, HConstants.LATEST_TIMESTAMP); + withcolumn = new HStoreKey(a, a, timestamp); + assertTrue(nocolumn.compareTo(withcolumn) < 0); } /** Index: src/test/org/apache/hadoop/hbase/regionserver/TestGet2.java =================================================================== --- src/test/org/apache/hadoop/hbase/regionserver/TestGet2.java (revision 680917) +++ src/test/org/apache/hadoop/hbase/regionserver/TestGet2.java (working copy) @@ -21,20 +21,19 @@ import java.io.IOException; import java.util.Map; -import java.util.HashSet; +import java.util.Set; import java.util.TreeMap; -import java.util.Set; import java.util.TreeSet; import org.apache.hadoop.dfs.MiniDFSCluster; -import org.apache.hadoop.hbase.filter.StopRowFilter; -import org.apache.hadoop.hbase.filter.WhileMatchRowFilter; import org.apache.hadoop.hbase.HBaseTestCase; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HStoreKey; import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.filter.StopRowFilter; +import org.apache.hadoop.hbase.filter.WhileMatchRowFilter; +import org.apache.hadoop.hbase.io.BatchUpdate; import org.apache.hadoop.hbase.io.Cell; -import org.apache.hadoop.hbase.io.BatchUpdate; import org.apache.hadoop.hbase.util.Bytes; /** @@ -47,13 +46,115 @@ @Override protected void setUp() throws Exception { super.setUp(); - this.miniHdfs = new MiniDFSCluster(this.conf, 1, true, null); + // REMOVE this.miniHdfs = new MiniDFSCluster(this.conf, 1, true, null); // Set the hbase.rootdir to be the home directory in mini dfs. - this.conf.set(HConstants.HBASE_DIR, - this.miniHdfs.getFileSystem().getHomeDirectory().toString()); + // REMOVE this.conf.set(HConstants.HBASE_DIR, + // this.miniHdfs.getFileSystem().getHomeDirectory().toString()); } /** + * Test a file with multiple deletes, including deletes as the final key.
+ * @see HBASE-751 + */ + public void testGetClosestRowBefore3() throws IOException{ + HRegion region = null; + BatchUpdate batchUpdate = null; + try { + HTableDescriptor htd = createTableDescriptor(getName()); + region = createNewHRegion(htd, null, null); + + // set up some test data + String t10 = "010"; + String t20 = "020"; + String t30 = "030"; + String t31 = "031"; + + batchUpdate = new BatchUpdate(t10); + batchUpdate.put(COLUMNS[0], t10.getBytes()); + region.batchUpdate(batchUpdate); + + batchUpdate = new BatchUpdate(t20); + batchUpdate.put(COLUMNS[0], t20.getBytes()); + region.batchUpdate(batchUpdate); + + batchUpdate = new BatchUpdate(t20); + batchUpdate.delete(COLUMNS[0]); + region.batchUpdate(batchUpdate); + + batchUpdate = new BatchUpdate(t30); + batchUpdate.put(COLUMNS[0], t30.getBytes()); + region.batchUpdate(batchUpdate); + + batchUpdate = new BatchUpdate(t30); + batchUpdate.delete(COLUMNS[0]); + region.batchUpdate(batchUpdate); + + Map results = + region.getClosestRowBefore(Bytes.toBytes(t30)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + + region.flushcache(); + + // try finding "010" after flush + results = region.getClosestRowBefore(Bytes.toBytes(t30)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + + // Put into a different column family. Should make it so I get + // and answer of t20. + batchUpdate = new BatchUpdate(t20); + batchUpdate.put(COLUMNS[1], t20.getBytes()); + region.batchUpdate(batchUpdate); + + results = region.getClosestRowBefore(Bytes.toBytes(t30)); + assertEquals(t20, new String(results.get(COLUMNS[1]).getValue())); + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t20, new String(results.get(COLUMNS[1]).getValue())); + region.flushcache(); + results = region.getClosestRowBefore(Bytes.toBytes(t30)); + assertEquals(t20, new String(results.get(COLUMNS[1]).getValue())); + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t20, new String(results.get(COLUMNS[1]).getValue())); + + // Now try combo of memcache and mapfiles. Delete the t20 COLUMS[1] + // in memory; make sure we get back t10 again. + batchUpdate = new BatchUpdate(t20); + batchUpdate.delete(COLUMNS[1]); + region.batchUpdate(batchUpdate); + batchUpdate = new BatchUpdate(t20); + batchUpdate.delete(COLUMNS[1]); + region.batchUpdate(batchUpdate); + batchUpdate = new BatchUpdate(t20); + batchUpdate.delete(COLUMNS[1]); + region.batchUpdate(batchUpdate); + results = region.getClosestRowBefore(Bytes.toBytes(t30)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + + + // Ask for a value off the end of the file. Should return t10. + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + region.flushcache(); + results = region.getClosestRowBefore(Bytes.toBytes(t31)); + assertEquals(t10, new String(results.get(COLUMNS[0]).getValue())); + } finally { + if (region != null) { + try { + region.close(); + } catch (Exception e) { + e.printStackTrace(); + } + region.getLog().closeAndDelete(); + } + } + } + + /** * Tests for HADOOP-2161. 
* @throws Exception */ Index: src/test/org/apache/hadoop/hbase/regionserver/TestRegionServerExit.java =================================================================== --- src/test/org/apache/hadoop/hbase/regionserver/TestRegionServerExit.java (revision 0) +++ src/test/org/apache/hadoop/hbase/regionserver/TestRegionServerExit.java (revision 0) @@ -0,0 +1,207 @@ +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HBaseClusterTestCase; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.LocalHBaseCluster; +import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.client.Scanner; +import org.apache.hadoop.hbase.io.BatchUpdate; +import org.apache.hadoop.hbase.io.RowResult; +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Tests region server failover when a region server exits both cleanly and + * when it aborts. + */ +public class TestRegionServerExit extends HBaseClusterTestCase { + final Log LOG = LogFactory.getLog(this.getClass().getName()); + HTable table; + + /** constructor */ + public TestRegionServerExit() { + super(2); + conf.setInt("ipc.client.connect.max.retries", 5); // reduce ipc retries + conf.setInt("ipc.client.timeout", 10000); // and ipc timeout + conf.setInt("hbase.client.pause", 10000); // increase client timeout + conf.setInt("hbase.client.retries.number", 10); // increase HBase retries + } + + /** + * Test abort of region server. + * @throws IOException + */ + public void testAbort() throws IOException { + // When the META table can be opened, the region servers are running + new HTable(conf, HConstants.META_TABLE_NAME); + // Create table and add a row. + final String tableName = getName(); + byte [] row = createTableAndAddRow(tableName); + // Start up a new region server to take over serving of root and meta + // after we shut down the current meta/root host. + this.cluster.startRegionServer(); + // Now abort the meta region server and wait for it to go down and come back + stopOrAbortMetaRegionServer(true); + // Verify that everything is back up. + LOG.info("Starting up the verification thread for " + getName()); + Thread t = startVerificationThread(tableName, row); + t.start(); + threadDumpingJoin(t); + } + + /** + * Test abort of region server. + * Test is flakey up on hudson. Needs work. 
+ * @throws IOException + */ + public void testCleanExit() throws IOException { + // When the META table can be opened, the region servers are running + new HTable(this.conf, HConstants.META_TABLE_NAME); + // Create table and add a row. + final String tableName = getName(); + byte [] row = createTableAndAddRow(tableName); + // Start up a new region server to take over serving of root and meta + // after we shut down the current meta/root host. + this.cluster.startRegionServer(); + // Now cleanly shut down the meta region server and wait for the meta region to be reassigned + stopOrAbortMetaRegionServer(false); + // Verify that everything is back up. + LOG.info("Starting up the verification thread for " + getName()); + Thread t = startVerificationThread(tableName, row); + t.start(); + threadDumpingJoin(t); + } + + private byte [] createTableAndAddRow(final String tableName) + throws IOException { + HTableDescriptor desc = new HTableDescriptor(tableName); + desc.addFamily(new HColumnDescriptor(HConstants.COLUMN_FAMILY)); + HBaseAdmin admin = new HBaseAdmin(conf); + admin.createTable(desc); + // put some values in the table + this.table = new HTable(conf, tableName); + byte [] row = Bytes.toBytes("row1"); + BatchUpdate b = new BatchUpdate(row); + b.put(HConstants.COLUMN_FAMILY, Bytes.toBytes(tableName)); + table.commit(b); + return row; + } + + /* + * Stop the region server serving the meta region and wait for the meta region + * to get reassigned. This is always the most problematic case. + * + * @param abort set to true if region server should be aborted, if false it + * is just shut down. + */ + private void stopOrAbortMetaRegionServer(boolean abort) { + List regionThreads = + cluster.getRegionThreads(); + + int server = -1; + for (int i = 0; i < regionThreads.size() && server == -1; i++) { + HRegionServer s = regionThreads.get(i).getRegionServer(); + Collection regions = s.getOnlineRegions(); + for (HRegion r : regions) { + if (Bytes.equals(r.getTableDesc().getName(), + HConstants.META_TABLE_NAME)) { + server = i; + } + } + } + if (server == -1) { + LOG.fatal("could not find region server serving meta region"); + fail(); + } + if (abort) { + this.cluster.abortRegionServer(server); + + } else { + this.cluster.stopRegionServer(server); + } + LOG.info(this.cluster.waitOnRegionServer(server) + " has been " + + (abort ? "aborted" : "shut down")); + } + + /* + * Run verification in a thread so I can concurrently run a thread-dumper + * while we're waiting (because in this test sometimes the meta scanner + * looks to be stuck). + * @param tableName Name of table to find. + * @param row Row we expect to find. + * @return Verification thread. Caller needs to call start on it. + */ + private Thread startVerificationThread(final String tableName, + final byte [] row) { + Runnable runnable = new Runnable() { + public void run() { + try { + // Now try to open a scanner on the meta table. Should stall until + // meta server comes back up.
+ HTable t = new HTable(conf, HConstants.META_TABLE_NAME); + Scanner s = + t.getScanner(HConstants.COLUMN_FAMILY_ARRAY, + HConstants.EMPTY_START_ROW); + s.close(); + + } catch (IOException e) { + LOG.fatal("could not re-open meta table because", e); + fail(); + } + Scanner scanner = null; + try { + // Verify that the client can find the data after the region has moved + // to a different server + scanner = + table.getScanner(HConstants.COLUMN_FAMILY_ARRAY, + HConstants.EMPTY_START_ROW); + LOG.info("Obtained scanner " + scanner); + for (RowResult r : scanner) { + assertTrue(Bytes.equals(r.getRow(), row)); + assertEquals(1, r.size()); + byte[] bytes = r.get(HConstants.COLUMN_FAMILY).getValue(); + assertNotNull(bytes); + assertTrue(tableName.equals(Bytes.toString(bytes))); + } + LOG.info("Success!"); + } catch (Exception e) { + e.printStackTrace(); + fail(); + } finally { + if (scanner != null) { + LOG.info("Closing scanner " + scanner); + scanner.close(); + } + } + } + }; + return new Thread(runnable); + } +} \ No newline at end of file Index: src/test/org/apache/hadoop/hbase/regionserver/DisabledTestRegionServerExit.java =================================================================== --- src/test/org/apache/hadoop/hbase/regionserver/DisabledTestRegionServerExit.java (revision 680917) +++ src/test/org/apache/hadoop/hbase/regionserver/DisabledTestRegionServerExit.java (working copy) @@ -1,207 +0,0 @@ -/** - * Copyright 2007 The Apache Software Foundation - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hbase.regionserver; - -import java.io.IOException; -import java.util.Collection; -import java.util.List; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hbase.HBaseClusterTestCase; -import org.apache.hadoop.hbase.HColumnDescriptor; -import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.LocalHBaseCluster; -import org.apache.hadoop.hbase.client.HBaseAdmin; -import org.apache.hadoop.hbase.client.HTable; -import org.apache.hadoop.hbase.client.Scanner; -import org.apache.hadoop.hbase.io.BatchUpdate; -import org.apache.hadoop.hbase.io.RowResult; -import org.apache.hadoop.hbase.util.Bytes; - -/** - * Tests region server failover when a region server exits both cleanly and - * when it aborts. 
- */ -public class DisabledTestRegionServerExit extends HBaseClusterTestCase { - final Log LOG = LogFactory.getLog(this.getClass().getName()); - HTable table; - - /** constructor */ - public DisabledTestRegionServerExit() { - super(2); - conf.setInt("ipc.client.connect.max.retries", 5); // reduce ipc retries - conf.setInt("ipc.client.timeout", 10000); // and ipc timeout - conf.setInt("hbase.client.pause", 10000); // increase client timeout - conf.setInt("hbase.client.retries.number", 10); // increase HBase retries - } - - /** - * Test abort of region server. - * @throws IOException - */ - public void testAbort() throws IOException { - // When the META table can be opened, the region servers are running - new HTable(conf, HConstants.META_TABLE_NAME); - // Create table and add a row. - final String tableName = getName(); - byte [] row = createTableAndAddRow(tableName); - // Start up a new region server to take over serving of root and meta - // after we shut down the current meta/root host. - this.cluster.startRegionServer(); - // Now abort the meta region server and wait for it to go down and come back - stopOrAbortMetaRegionServer(true); - // Verify that everything is back up. - LOG.info("Starting up the verification thread for " + getName()); - Thread t = startVerificationThread(tableName, row); - t.start(); - threadDumpingJoin(t); - } - - /** - * Test abort of region server. - * Test is flakey up on hudson. Needs work. - * @throws IOException - */ - public void testCleanExit() throws IOException { - // When the META table can be opened, the region servers are running - new HTable(this.conf, HConstants.META_TABLE_NAME); - // Create table and add a row. - final String tableName = getName(); - byte [] row = createTableAndAddRow(tableName); - // Start up a new region server to take over serving of root and meta - // after we shut down the current meta/root host. - this.cluster.startRegionServer(); - // Now abort the meta region server and wait for it to go down and come back - stopOrAbortMetaRegionServer(false); - // Verify that everything is back up. - LOG.info("Starting up the verification thread for " + getName()); - Thread t = startVerificationThread(tableName, row); - t.start(); - threadDumpingJoin(t); - } - - private byte [] createTableAndAddRow(final String tableName) - throws IOException { - HTableDescriptor desc = new HTableDescriptor(tableName); - desc.addFamily(new HColumnDescriptor(HConstants.COLUMN_FAMILY)); - HBaseAdmin admin = new HBaseAdmin(conf); - admin.createTable(desc); - // put some values in the table - this.table = new HTable(conf, tableName); - byte [] row = Bytes.toBytes("row1"); - BatchUpdate b = new BatchUpdate(row); - b.put(HConstants.COLUMN_FAMILY, Bytes.toBytes(tableName)); - table.commit(b); - return row; - } - - /* - * Stop the region server serving the meta region and wait for the meta region - * to get reassigned. This is always the most problematic case. - * - * @param abort set to true if region server should be aborted, if false it - * is just shut down. 
- */ - private void stopOrAbortMetaRegionServer(boolean abort) { - List regionThreads = - cluster.getRegionThreads(); - - int server = -1; - for (int i = 0; i < regionThreads.size() && server == -1; i++) { - HRegionServer s = regionThreads.get(i).getRegionServer(); - Collection regions = s.getOnlineRegions(); - for (HRegion r : regions) { - if (Bytes.equals(r.getTableDesc().getName(), - HConstants.META_TABLE_NAME)) { - server = i; - } - } - } - if (server == -1) { - LOG.fatal("could not find region server serving meta region"); - fail(); - } - if (abort) { - this.cluster.abortRegionServer(server); - - } else { - this.cluster.stopRegionServer(server); - } - LOG.info(this.cluster.waitOnRegionServer(server) + " has been " + - (abort ? "aborted" : "shut down")); - } - - /* - * Run verification in a thread so I can concurrently run a thread-dumper - * while we're waiting (because in this test sometimes the meta scanner - * looks to be be stuck). - * @param tableName Name of table to find. - * @param row Row we expect to find. - * @return Verification thread. Caller needs to calls start on it. - */ - private Thread startVerificationThread(final String tableName, - final byte [] row) { - Runnable runnable = new Runnable() { - public void run() { - try { - // Now try to open a scanner on the meta table. Should stall until - // meta server comes back up. - HTable t = new HTable(conf, HConstants.META_TABLE_NAME); - Scanner s = - t.getScanner(HConstants.COLUMN_FAMILY_ARRAY, - HConstants.EMPTY_START_ROW); - s.close(); - - } catch (IOException e) { - LOG.fatal("could not re-open meta table because", e); - fail(); - } - Scanner scanner = null; - try { - // Verify that the client can find the data after the region has moved - // to a different server - scanner = - table.getScanner(HConstants.COLUMN_FAMILY_ARRAY, - HConstants.EMPTY_START_ROW); - LOG.info("Obtained scanner " + scanner); - for (RowResult r : scanner) { - assertTrue(Bytes.equals(r.getRow(), row)); - assertEquals(1, r.size()); - byte[] bytes = r.get(HConstants.COLUMN_FAMILY).getValue(); - assertNotNull(bytes); - assertTrue(tableName.equals(Bytes.toString(bytes))); - } - LOG.info("Success!"); - } catch (Exception e) { - e.printStackTrace(); - fail(); - } finally { - if (scanner != null) { - LOG.info("Closing scanner " + scanner); - scanner.close(); - } - } - } - }; - return new Thread(runnable); - } -} \ No newline at end of file Index: src/java/org/apache/hadoop/hbase/regionserver/Memcache.java =================================================================== --- src/java/org/apache/hadoop/hbase/regionserver/Memcache.java (revision 680917) +++ src/java/org/apache/hadoop/hbase/regionserver/Memcache.java (working copy) @@ -344,32 +344,32 @@ } private void internalGetRowKeyAtOrBefore(SortedMap map, - byte [] key, SortedMap candidateKeys) { - HStoreKey strippedKey = null; + byte [] key, SortedMap candidateKeys) { + // We want the earliest possible to start searching from. Start before + // the candidate key in case it turns out a delete came in later. + HStoreKey search_key = candidateKeys.isEmpty()? new HStoreKey(key): + new HStoreKey(candidateKeys.firstKey().getRow()); + List victims = new ArrayList(); + long now = System.currentTimeMillis(); - // we want the earliest possible to start searching from - HStoreKey search_key = candidateKeys.isEmpty() ? 
- new HStoreKey(key) : new HStoreKey(candidateKeys.firstKey().getRow()); - Iterator key_iterator = null; - HStoreKey found_key = null; - ArrayList victims = new ArrayList(); - long now = System.currentTimeMillis(); - // get all the entries that come equal or after our search key + // Get all the entries that come equal or after our search key SortedMap tailMap = map.tailMap(search_key); // if there are items in the tail map, there's either a direct match to // the search key, or a range of values between the first candidate key // and the ultimate search key (or the end of the cache) if (!tailMap.isEmpty() && - Bytes.compareTo(tailMap.firstKey().getRow(), key) <= 0) { - key_iterator = tailMap.keySet().iterator(); + Bytes.compareTo(tailMap.firstKey().getRow(), search_key.getRow()) <= 0) { + Iterator key_iterator = tailMap.keySet().iterator(); - // keep looking at cells as long as they are no greater than the + // Keep looking at cells as long as they are no greater than the // ultimate search key and there's still records left in the map. + HStoreKey deletedOrExpiredRow = null; + HStoreKey found_key = null; do { found_key = key_iterator.next(); - if (Bytes.compareTo(found_key.getRow(), key) <= 0) { - strippedKey = stripTimestamp(found_key); + if (Bytes.compareTo(found_key.getRow(), search_key.getRow()) <= 0) { + HStoreKey strippedKey = stripTimestamp(found_key); if (HLogEdit.isDeleted(tailMap.get(found_key))) { if (candidateKeys.containsKey(strippedKey)) { long bestCandidateTs = @@ -378,12 +378,18 @@ candidateKeys.remove(strippedKey); } } + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = strippedKey; + } } else { if (ttl == HConstants.FOREVER || now < found_key.getTimestamp() + ttl) { candidateKeys.put(strippedKey, new Long(found_key.getTimestamp())); } else { + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = strippedKey; + } victims.add(found_key); if (LOG.isDebugEnabled()) { LOG.debug(":" + found_key + ": expired, skipped"); @@ -391,96 +397,113 @@ } } } - } while (Bytes.compareTo(found_key.getRow(), key) <= 0 - && key_iterator.hasNext()); + } while (Bytes.compareTo(found_key.getRow(), search_key.getRow()) <= 0 && + key_iterator.hasNext()); + if (candidateKeys.isEmpty() && deletedOrExpiredRow != null) { + getRowKeyBefore(map, deletedOrExpiredRow, candidateKeys, victims, now); + } } else { - // the tail didn't contain any keys that matched our criteria, or was - // empty. examine all the keys that preceed our splitting point. - SortedMap headMap = map.headMap(search_key); - - // if we tried to create a headMap and got an empty map, then there are - // no keys at or before the search key, so we're done. - if (headMap.isEmpty()) { - return; - } - - // if there aren't any candidate keys at this point, we need to search - // backwards until we find at least one candidate or run out of headMap. - if (candidateKeys.isEmpty()) { - HStoreKey[] cells = - headMap.keySet().toArray(new HStoreKey[headMap.keySet().size()]); - - byte [] lastRowFound = null; - for(int i = cells.length - 1; i >= 0; i--) { - HStoreKey thisKey = cells[i]; - - // if the last row we found a candidate key for is different than - // the row of the current candidate, we can stop looking. - if (lastRowFound != null && - !Bytes.equals(lastRowFound, thisKey.getRow())) { - break; - } - - // if this isn't a delete, record it as a candidate key. also - // take note of the row of this candidate so that we'll know when - // we cross the row boundary into the previous row. 
- if (!HLogEdit.isDeleted(headMap.get(thisKey))) { - if (ttl == HConstants.FOREVER) { - lastRowFound = thisKey.getRow(); - candidateKeys.put(stripTimestamp(thisKey), - new Long(thisKey.getTimestamp())); - } else { - victims.add(found_key); - if (LOG.isDebugEnabled()) { - LOG.debug("internalGetRowKeyAtOrBefore: " + found_key + - ": expired, skipped"); - } + // The tail didn't contain any keys that matched our criteria, or was + // empty. Examine all the keys that precede our splitting point. + getRowKeyBefore(map, search_key, candidateKeys, victims, now); + } + // Remove expired victims from the map. + for (HStoreKey victim: victims) { + map.remove(victim); + } + } + + /* + * Get row key that comes before passed search_key + * Use when we know search_key is not in the map and we need to search + * earlier in the cache. + * @param map + * @param search_key + * @param candidateKeys + * @param victims + * @param now + */ + private void getRowKeyBefore(SortedMap map, + HStoreKey search_key, SortedMap candidateKeys, + List victims, final long now) { + SortedMap headMap = map.headMap(search_key); + // if we tried to create a headMap and got an empty map, then there are + // no keys at or before the search key, so we're done. + if (headMap.isEmpty()) { + return; + } + + // if there aren't any candidate keys at this point, we need to search + // backwards until we find at least one candidate or run out of headMap. + HStoreKey found_key = null; + if (candidateKeys.isEmpty()) { + HStoreKey[] cells = + headMap.keySet().toArray(new HStoreKey[headMap.keySet().size()]); + byte [] lastRowFound = null; + for (int i = cells.length - 1; i >= 0; i--) { + HStoreKey thisKey = cells[i]; + // if the last row we found a candidate key for is different than + // the row of the current candidate, we can stop looking -- if it's + // not a delete record + boolean deleted = HLogEdit.isDeleted(headMap.get(thisKey)); + if (lastRowFound != null && + !Bytes.equals(lastRowFound, thisKey.getRow()) && !deleted) { + break; + } + + // if this isn't a delete, record it as a candidate key. also + // take note of the row of this candidate so that we'll know when + // we cross the row boundary into the previous row. + if (!deleted) { + if (ttl == HConstants.FOREVER || now < thisKey.getTimestamp() + ttl) { + lastRowFound = thisKey.getRow(); + candidateKeys.put(stripTimestamp(thisKey), + new Long(thisKey.getTimestamp())); + } else { + victims.add(thisKey); + if (LOG.isDebugEnabled()) { + LOG.debug("internalGetRowKeyAtOrBefore: " + thisKey + + ": expired, skipped"); } } } - } else { - // if there are already some candidate keys, we only need to consider - // the very last row's worth of keys in the headMap, because any - // smaller acceptable candidate keys would have caused us to start - // our search earlier in the list, and we wouldn't be searching here.
- SortedMap thisRowTailMap = - headMap.tailMap(new HStoreKey(headMap.lastKey().getRow())); - - key_iterator = thisRowTailMap.keySet().iterator(); - - do { - found_key = key_iterator.next(); - - if (HLogEdit.isDeleted(thisRowTailMap.get(found_key))) { - strippedKey = stripTimestamp(found_key); - if (candidateKeys.containsKey(strippedKey)) { - long bestCandidateTs = - candidateKeys.get(strippedKey).longValue(); - if (bestCandidateTs <= found_key.getTimestamp()) { - candidateKeys.remove(strippedKey); - } + } + } else { + // if there are already some candidate keys, we only need to consider + // the very last row's worth of keys in the headMap, because any + // smaller acceptable candidate keys would have caused us to start + // our search earlier in the list, and we wouldn't be searching here. + SortedMap thisRowTailMap = + headMap.tailMap(new HStoreKey(headMap.lastKey().getRow())); + Iterator key_iterator = thisRowTailMap.keySet().iterator(); + do { + found_key = key_iterator.next(); + + if (HLogEdit.isDeleted(thisRowTailMap.get(found_key))) { + HStoreKey strippedKey = stripTimestamp(found_key); + if (candidateKeys.containsKey(strippedKey)) { + long bestCandidateTs = + candidateKeys.get(strippedKey).longValue(); + if (bestCandidateTs <= found_key.getTimestamp()) { + candidateKeys.remove(strippedKey); } + } + } else { + if (ttl == HConstants.FOREVER || + now < found_key.getTimestamp() + ttl) { + candidateKeys.put(stripTimestamp(found_key), + Long.valueOf(found_key.getTimestamp())); } else { - if (ttl == HConstants.FOREVER || - now < found_key.getTimestamp() + ttl) { - candidateKeys.put(stripTimestamp(found_key), - Long.valueOf(found_key.getTimestamp())); - } else { - victims.add(found_key); - if (LOG.isDebugEnabled()) { - LOG.debug("internalGetRowKeyAtOrBefore: " + found_key + - ": expired, skipped"); - } + victims.add(found_key); + if (LOG.isDebugEnabled()) { + LOG.debug("internalGetRowKeyAtOrBefore: " + found_key + + ": expired, skipped"); } } - } while (key_iterator.hasNext()); - } + } + } while (key_iterator.hasNext()); } - // Remove expired victims from the map. - for (HStoreKey victim: victims) - map.remove(victim); } - + static HStoreKey stripTimestamp(HStoreKey key) { return new HStoreKey(key.getRow(), key.getColumn()); } Index: src/java/org/apache/hadoop/hbase/regionserver/HStore.java =================================================================== --- src/java/org/apache/hadoop/hbase/regionserver/HStore.java (revision 680917) +++ src/java/org/apache/hadoop/hbase/regionserver/HStore.java (working copy) @@ -105,8 +105,7 @@ Collections.synchronizedSortedMap(new TreeMap()); /* - * Sorted Map of readers keyed by sequence id (Most recent should be last in - * in list). + * Sorted Map of readers keyed by sequence id (Most recent is last in list). */ private final SortedMap readers = new TreeMap(); @@ -761,7 +760,8 @@ return checkSplit(); } - // HBASE-745, preparing all store file size for incremental compacting selection. + // HBASE-745, preparing all store file size for incremental compacting + // selection. int countOfFiles = filesToCompact.size(); long totalSize = 0; long[] fileSizes = new long[countOfFiles]; @@ -780,7 +780,7 @@ if (!force && !hasReferences(filesToCompact)) { // Here we select files for incremental compaction. // The rule is: if the largest(oldest) one is more than twice the - // size of the second, skip the largest, and continue to next..., + // size of the second, skip the largest, and continue to next..., // until we meet the compactionThreshold limit. 
for (point = 0; point < compactionThreshold - 1; point++) { if (fileSizes[point] < fileSizes[point + 1] * 2) { @@ -791,7 +791,8 @@ filesToCompact = new ArrayList(filesToCompact.subList(point, countOfFiles)); if (LOG.isDebugEnabled()) { - LOG.debug("Compaction size " + totalSize + ", skipped " + point + + LOG.debug("Compaction size of " + this.getFamily().getNameAsString() + + StringUtils.humanReadableInt(totalSize) + ", skipped " + point + ", " + skipped); } } @@ -855,7 +856,7 @@ } return checkSplit(); } - + /* * Compact a list of MapFile.Readers into MapFile.Writer. * @@ -1219,7 +1220,10 @@ } while(map.next(readkey, readval)); } } - + + /** + * @return Array of readers ordered oldest to newest. + */ MapFile.Reader [] getReaders() { return this.readers.values(). toArray(new MapFile.Reader[this.readers.size()]); @@ -1427,7 +1431,10 @@ * Find the key that matches row exactly, or the one that immediately * preceeds it. WARNING: Only use this method on a table where writes occur * with stricly increasing timestamps. This method assumes this pattern of - * writes in order to make it reasonably performant. + * writes in order to make it reasonably performant. + * @param row + * @return Found row + * @throws IOException */ byte [] getRowKeyAtOrBefore(final byte [] row) throws IOException{ @@ -1436,14 +1443,14 @@ // deletes found all over the place as we go along before finally reading // the best key out of it at the end. SortedMap candidateKeys = new TreeMap(); - // Obtain read lock this.lock.readLock().lock(); try { - // Process each store file + // Process each store file. Run through from oldest to newest so deletes + // have chance to overshadow deleted cells MapFile.Reader[] maparray = getReaders(); - for (int i = maparray.length - 1; i >= 0; i--) { - // update the candidate keys from the current map file + for (int i = 0; i < maparray.length; i++) { + // Update the candidate keys from the current map file rowAtOrBeforeFromMapFile(maparray[i], row, candidateKeys); } @@ -1457,70 +1464,98 @@ } } - /** + /* * Check an individual MapFile for the row at or before a given key * and timestamp + * @param map + * @param row + * @param candidateKeys + * @throws IOException */ private void rowAtOrBeforeFromMapFile(MapFile.Reader map, final byte [] row, SortedMap candidateKeys) throws IOException { - ImmutableBytesWritable readval = new ImmutableBytesWritable(); - HStoreKey readkey = new HStoreKey(); - + HStoreKey startKey = new HStoreKey(); + ImmutableBytesWritable startValue = new ImmutableBytesWritable(); synchronized(map) { - // don't bother with the rest of this if the file is empty + // Don't bother with the rest of this if the file is empty map.reset(); - if (!map.next(readkey, readval)) { + if (!map.next(startKey, startValue)) { return; } - + // If start row for this file is beyond passed in row, return; nothing + // in here is of use to us. 
+ if (Bytes.compareTo(startKey.getRow(), row) > 0) { + return; + } long now = System.currentTimeMillis(); - - // if there aren't any candidate keys yet, we'll do some things slightly - // different + // if there aren't any candidate keys yet, we'll do some things differently if (candidateKeys.isEmpty()) { - rowKeyFromMapFileEmptyKeys(map, row, candidateKeys, now); + rowAtOrBeforeCandidate(startKey, map, row, candidateKeys, now); } else { - rowKeyAtOrBeforeExistingCandKeys(map, row, candidateKeys, now); + rowAtOrBeforeWithCandidates(startKey, map, row, candidateKeys, + now); } } } - private void rowKeyFromMapFileEmptyKeys(MapFile.Reader map, byte[] row, - SortedMap candidateKeys, long now) + /* Find a candidate for row that is at or before passed row in passed + * mapfile. + * @param map + * @param row + * @param candidateKeys + * @param now + * @throws IOException + */ + private void rowAtOrBeforeCandidate(final HStoreKey startKey, + final MapFile.Reader map, final byte[] row, + final SortedMap candidateKeys, final long now) throws IOException { - - HStoreKey searchKey = new HStoreKey(row); + // if the row we're looking for is past the end of this mapfile, set the + // search key to be the last key. If it's a deleted key, then we'll back + // up to the row before and return that. + HStoreKey finalKey = getFinalKey(map); + HStoreKey searchKey = null; + if (Bytes.compareTo(finalKey.getRow(), row) <= 0) { + searchKey = finalKey; + } else { + searchKey = new HStoreKey(row); + if (searchKey.compareTo(startKey) < 0) { + searchKey = startKey; + } + } + rowAtOrBeforeCandidate(map, searchKey, candidateKeys, now); + } + + /* + * Find candidate for row at or before passed searchKey. + * @param map + * @param sk + * @param candidateKeys + * @param now + * @throws IOException + */ + private void rowAtOrBeforeCandidate(final MapFile.Reader map, + final HStoreKey sk, final SortedMap candidateKeys, + final long now) + throws IOException { + HStoreKey searchKey = sk; ImmutableBytesWritable readval = new ImmutableBytesWritable(); HStoreKey readkey = new HStoreKey(); - - // if the row we're looking for is past the end of this mapfile, just - // save time and add the last key to the candidates.
- HStoreKey finalKey = new HStoreKey(); - map.finalKey(finalKey); - if (Bytes.compareTo(finalKey.getRow(), row) < 0) { - candidateKeys.put(stripTimestamp(finalKey), - new Long(finalKey.getTimestamp())); - return; - } - - HStoreKey deletedOrExpiredRow = null; - boolean foundCandidate = false; - while (!foundCandidate) { + for (boolean foundCandidate = false; !foundCandidate;) { // seek to the exact row, or the one that would be immediately before it readkey = (HStoreKey)map.getClosest(searchKey, readval, true); - if (readkey == null) { - // didn't find anything that would match, so return - return; + break; } - + HStoreKey deletedOrExpiredRow = null; do { // if we have an exact match on row, and it's not a delete, save this // as a candidate key - if (Bytes.equals(readkey.getRow(), row)) { + if (Bytes.equals(readkey.getRow(), searchKey.getRow())) { if (!HLogEdit.isDeleted(readval.get())) { - if (ttl == HConstants.FOREVER || + if (ttl == HConstants.FOREVER || now < readkey.getTimestamp() + ttl) { candidateKeys.put(stripTimestamp(readkey), new Long(readkey.getTimestamp())); @@ -1532,8 +1567,10 @@ ": expired, skipped"); } } - deletedOrExpiredRow = stripTimestamp(readkey); - } else if (Bytes.compareTo(readkey.getRow(), row) > 0 ) { + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = stripTimestamp(readkey); + } + } else if (Bytes.compareTo(readkey.getRow(), searchKey.getRow()) > 0) { // if the row key we just read is beyond the key we're searching for, // then we're done. return. break; @@ -1554,70 +1591,54 @@ ": expired, skipped"); } } - deletedOrExpiredRow = stripTimestamp(readkey); + if (deletedOrExpiredRow == null) { + deletedOrExpiredRow = stripTimestamp(readkey); + } } } while(map.next(readkey, readval)); // If we get here and have no candidates but we did find a deleted or // expired candidate, we need to look at the key before that - if (!foundCandidate && deletedOrExpiredRow != null) { - searchKey = deletedOrExpiredRow; - deletedOrExpiredRow = null; - + // Rong-en suggests that BeforeThisStoreKey is not really needed here + // since 'deletedOrExpiredRow' is using LATEST_TIMESTAMP and that + // shouldn't ever make it into a mapfile. Leaving to be safe. + searchKey = new BeforeThisStoreKey(deletedOrExpiredRow); } else { // No candidates and no deleted or expired candidates. Give up. break; } } - - // arriving here just means that we consumed the whole rest of the map + + // Arriving here just means that we consumed the whole rest of the map // without going "past" the key we're searching for. we can just fall // through here. } - - private void rowKeyAtOrBeforeExistingCandKeys(MapFile.Reader map, byte[] row, - SortedMap candidateKeys, long now) + private void rowAtOrBeforeWithCandidates(final HStoreKey startKey, + final MapFile.Reader map, final byte[] row, + final SortedMap candidateKeys, final long now) throws IOException { - - HStoreKey strippedKey = null; ImmutableBytesWritable readval = new ImmutableBytesWritable(); HStoreKey readkey = new HStoreKey(); + HStoreKey strippedKey = null; - // if there are already candidate keys, we need to start our search // at the earliest possible key so that we can discover any possible - // deletes for keys between the start and the search key. + // deletes for keys between the start and the search key. Back up to start + // of the row in case there are deletes for this candidate in this mapfile + // BUT do not backup before the first key in the mapfile else getClosest + // will return null. 
HStoreKey searchKey = new HStoreKey(candidateKeys.firstKey().getRow()); - - // if the row we're looking for is past the end of this mapfile, just - // save time and add the last key to the candidates. - HStoreKey finalKey = new HStoreKey(); - map.finalKey(finalKey); - if (Bytes.compareTo(finalKey.getRow(), searchKey.getRow()) < 0) { - strippedKey = stripTimestamp(finalKey); - - // if the candidate keys has a cell like this one already, - // then we might want to update the timestamp we're using on it - if (candidateKeys.containsKey(strippedKey)) { - long bestCandidateTs = - candidateKeys.get(strippedKey).longValue(); - if (bestCandidateTs < finalKey.getTimestamp()) { - candidateKeys.put(strippedKey, new Long(finalKey.getTimestamp())); - } - } else { - // otherwise, this is a new key, so put it up as a candidate - candidateKeys.put(strippedKey, new Long(finalKey.getTimestamp())); - } - return; + if (searchKey.compareTo(startKey) < 0) { + searchKey = startKey; } - - // seek to the exact row, or the one that would be immediately before it + + // Seek to the exact row, or the one that would be immediately before it readkey = (HStoreKey)map.getClosest(searchKey, readval, true); - if (readkey == null) { - // didn't find anything that would match, so return + // If null, we are at the start or end of the file. + // Didn't find anything that would match, so return return; } @@ -1638,7 +1659,7 @@ } } } else { - // if the candidate keys contain any that might match by timestamp, + // If the candidate keys contain any that might match by timestamp, // then check for a match and remove it if it's too young to // survive the delete if (candidateKeys.containsKey(strippedKey)) { @@ -1652,10 +1673,9 @@ } else if (Bytes.compareTo(readkey.getRow(), row) > 0 ) { // if the row key we just read is beyond the key we're searching for, // then we're done. return. - return; + break; } else { strippedKey = stripTimestamp(readkey); - // so, the row key doesn't match, but we haven't gone past the row // we're seeking yet, so this row is a candidate for closest // (assuming that it isn't a delete). @@ -1685,6 +1705,17 @@ } while(map.next(readkey, readval)); } + /* + * @param mf MapFile to dig in. + * @return Final key from passed mf + * @throws IOException + */ + private HStoreKey getFinalKey(final MapFile.Reader mf) throws IOException { + HStoreKey finalKey = new HStoreKey(); + mf.finalKey(finalKey); + return finalKey; + } + static HStoreKey stripTimestamp(HStoreKey key) { return new HStoreKey(key.getRow(), key.getColumn()); } Index: src/java/org/apache/hadoop/hbase/regionserver/BeforeThisStoreKey.java =================================================================== --- src/java/org/apache/hadoop/hbase/regionserver/BeforeThisStoreKey.java (revision 0) +++ src/java/org/apache/hadoop/hbase/regionserver/BeforeThisStoreKey.java (revision 0) @@ -0,0 +1,119 @@ +/** + * Copyright 2008 The Apache Software Foundation + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.hbase.HStoreKey; + +/** + * Pass this class into {@link org.apache.hadoop.io.MapFile}.getClosest when + * searching for the key that comes BEFORE this one but NOT this one. This + * class will return > 0 when asked to compare against itself rather than 0. + * This is a hack for the case where getClosest returns a deleted key and we want + * to get the previous. Can't unless we use this class; it'll just keep + * returning us the deleted key (getClosest gets exact or nearest before when + * you pass true argument). TODO: Throw this class away when MapFile has + * a real 'previous' method. See HBASE-751. + */ +public class BeforeThisStoreKey extends HStoreKey { + private final HStoreKey beforeThisKey; + + /** + * @param beforeThisKey + */ + public BeforeThisStoreKey(final HStoreKey beforeThisKey) { + super(); + this.beforeThisKey = beforeThisKey; + } + + @Override + public int compareTo(Object o) { + int result = this.beforeThisKey.compareTo(o); + return result == 0? 1: result; + } + + @Override + public boolean equals(@SuppressWarnings("unused") Object obj) { + return false; + } + + public byte[] getColumn() { + return this.beforeThisKey.getColumn(); + } + + public byte[] getRow() { + return this.beforeThisKey.getRow(); + } + + public long getSize() { + return this.beforeThisKey.getSize(); + } + + public long getTimestamp() { + return this.beforeThisKey.getTimestamp(); + } + + public int hashCode() { + return this.beforeThisKey.hashCode(); + } + + public boolean matchesRowCol(HStoreKey other) { + return this.beforeThisKey.matchesRowCol(other); + } + + public boolean matchesRowFamily(HStoreKey that) { + return this.beforeThisKey.matchesRowFamily(that); + } + + public boolean matchesWithoutColumn(HStoreKey other) { + return this.beforeThisKey.matchesWithoutColumn(other); + } + + public void readFields(DataInput in) throws IOException { + this.beforeThisKey.readFields(in); + } + + public void set(HStoreKey k) { + this.beforeThisKey.set(k); + } + + public void setColumn(byte[] c) { + this.beforeThisKey.setColumn(c); + } + + public void setRow(byte[] newrow) { + this.beforeThisKey.setRow(newrow); + } + + public void setVersion(long timestamp) { + this.beforeThisKey.setVersion(timestamp); + } + + public String toString() { + return this.beforeThisKey.toString(); + } + + public void write(DataOutput out) throws IOException { + this.beforeThisKey.write(out); + } +} \ No newline at end of file Index: src/java/org/apache/hadoop/hbase/regionserver/CompactSplitThread.java =================================================================== --- src/java/org/apache/hadoop/hbase/regionserver/CompactSplitThread.java (revision 680917) +++ src/java/org/apache/hadoop/hbase/regionserver/CompactSplitThread.java (working copy) @@ -92,7 +92,7 @@ } catch (InterruptedException ex) { continue; } catch (IOException ex) { - LOG.error("Compaction failed" + + LOG.error("Compaction/Split failed" + (r != null ?
(" for region " + Bytes.toString(r.getRegionName())) : ""), RemoteExceptionHandler.checkIOException(ex)); if (!server.checkFileSystem()) { Index: src/java/org/apache/hadoop/hbase/io/BatchUpdate.java =================================================================== --- src/java/org/apache/hadoop/hbase/io/BatchUpdate.java (revision 680917) +++ src/java/org/apache/hadoop/hbase/io/BatchUpdate.java (working copy) @@ -49,7 +49,7 @@ private long timestamp = HConstants.LATEST_TIMESTAMP; /** - * Default constructor used serializing. + * Default constructor used serializing. Do not use directly. */ public BatchUpdate() { this ((byte [])null); Index: src/java/org/apache/hadoop/hbase/client/HTable.java =================================================================== --- src/java/org/apache/hadoop/hbase/client/HTable.java (revision 680917) +++ src/java/org/apache/hadoop/hbase/client/HTable.java (working copy) @@ -1182,7 +1182,7 @@ */ public synchronized void commit(final List batchUpdates) throws IOException { - for(BatchUpdate batchUpdate : batchUpdates) + for (BatchUpdate batchUpdate : batchUpdates) commit(batchUpdate); }