From a889ac4c1d525d1e2cda20e70d363236f9e38dd0 Mon Sep 17 00:00:00 2001
From: stack
Date: Sun, 17 May 2020 18:02:54 -0700
Subject: [PATCH] HBASE-17756 We should have better introspection of HFiles

TODO: make it optional.

hbase-server/pom.xml
 Add dependency on data sketches.

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
 Print out key and value quantiles.

hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java
 Accumulate sketches. Use sketches for counts and average key and value size.
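For review context, the DataSketches round-trip this change leans on boils
down to the following. This is a minimal sketch against the datasketches-java
1.3.0-incubating quantiles API; the SketchRoundTrip class and the sizes fed
to update() are made up for illustration:

    import org.apache.datasketches.memory.Memory;
    import org.apache.datasketches.quantiles.DoublesSketch;
    import org.apache.datasketches.quantiles.UpdateDoublesSketch;

    public class SketchRoundTrip {
      public static void main(String[] args) {
        // Writer side: one update per cell, e.g. the serialized key size.
        UpdateDoublesSketch sketch = DoublesSketch.builder().build();
        for (int i = 1; i <= 1000; i++) {
          sketch.update(i);
        }
        // This byte[] is what HFileWriterImpl stores in a meta block.
        byte[] bytes = sketch.toByteArray();

        // Reader side: what HFilePrettyPrinter does with the block read back.
        DoublesSketch read = DoublesSketch.wrap(Memory.wrap(bytes));
        System.out.println("n=" + read.getN()
            + ", median=" + read.getQuantile(0.5)
            + ", p95=" + read.getQuantile(0.95));
      }
    }

DoublesSketch.wrap gives a read-only view over the serialized bytes, which is
why the writer can dump toByteArray() into a meta block and the pretty printer
can query quantiles straight off what it reads back.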
---
 hbase-server/pom.xml                          |   4 +
 .../hbase/io/hfile/HFilePrettyPrinter.java    | 104 +++++++++++++-----
 .../hbase/io/hfile/HFileWriterImpl.java       |  52 +++++----
 .../io/hfile/TestHFilePrettyPrinter.java      |  16 ++-
 hbase-shaded/hbase-shaded-mapreduce/pom.xml   |   4 +
 pom.xml                                       |  10 ++
 6 files changed, 141 insertions(+), 49 deletions(-)

diff --git a/hbase-server/pom.xml b/hbase-server/pom.xml
index 5fab6c8274..8fea205cb7 100644
--- a/hbase-server/pom.xml
+++ b/hbase-server/pom.xml
@@ -447,6 +447,10 @@
       <groupId>com.lmax</groupId>
       <artifactId>disruptor</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.datasketches</groupId>
+      <artifactId>datasketches-java</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.hamcrest</groupId>
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
index 57db2068ec..57ab8a96eb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java
@@ -19,7 +19,6 @@
 package org.apache.hadoop.hbase.io.hfile;
 
 import static com.codahale.metrics.MetricRegistry.name;
-
 import com.codahale.metrics.ConsoleReporter;
 import com.codahale.metrics.Counter;
 import com.codahale.metrics.Gauge;
@@ -34,8 +33,10 @@ import java.io.ByteArrayOutputStream;
 import java.io.DataInput;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.math.BigDecimal;
 import java.text.DateFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
@@ -46,7 +47,11 @@ import java.util.Set;
 import java.util.SortedMap;
 import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
+import java.util.function.DoubleSupplier;
+import java.util.stream.DoubleStream;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.datasketches.memory.Memory;
+import org.apache.datasketches.quantiles.DoublesSketch;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
@@ -64,6 +69,7 @@ import org.apache.hadoop.hbase.PrivateCellUtil;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.Tag;
 import org.apache.hadoop.hbase.io.FSDataInputStreamWrapper;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
 import org.apache.hadoop.hbase.mob.MobUtils;
 import org.apache.hadoop.hbase.regionserver.HStoreFile;
 import org.apache.hadoop.hbase.regionserver.TimeRangeTracker;
@@ -79,7 +85,6 @@ import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.yetus.audience.InterfaceStability;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine;
 import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLineParser;
 import org.apache.hbase.thirdparty.org.apache.commons.cli.HelpFormatter;
@@ -127,6 +132,26 @@ public class HFilePrettyPrinter extends Configured implements Tool {
   private static final String FOUR_SPACES = "    ";
 
+  /**
+   * Supplier that counts up from 0 in 0.01 increments.
+   */
+  private static final DoubleSupplier IN_POINT_01_INC = new DoubleSupplier() {
+    private BigDecimal accumulator = new BigDecimal(0);
+    private final BigDecimal pointOhOne = new BigDecimal("0.01");
+
+    @Override
+    public double getAsDouble() {
+      double d = this.accumulator.doubleValue();
+      this.accumulator = this.accumulator.add(pointOhOne);
+      return d;
+    }
+  };
+
+  /**
+   * Array of 100 normalized ranks, 0.0 through 0.99 in 0.01 increments.
+   */
+  static final double[] NORMALIZED_RANKS = DoubleStream.generate(IN_POINT_01_INC).limit(100).toArray();
+
   public HFilePrettyPrinter() {
     super();
     init();
   }
@@ -167,7 +192,7 @@ public class HFilePrettyPrinter extends Configured implements Tool {
     this.err = err;
   }
 
-  public boolean parseOptions(String args[]) throws ParseException,
+  public boolean parseOptions(String[] args) throws ParseException,
       IOException {
     if (args.length == 0) {
       HelpFormatter formatter = new HelpFormatter();
@@ -211,18 +236,20 @@
       Path tableDir = CommonFSUtils.getTableDir(rootDir, TableName.valueOf(hri[0]));
       String enc = HRegionInfo.encodeRegionName(rn);
       Path regionDir = new Path(tableDir, enc);
-      if (verbose)
+      if (verbose) {
         out.println("region dir -> " + regionDir);
+      }
       List<Path> regionFiles = HFile.getStoreFiles(FileSystem.get(getConf()), regionDir);
-      if (verbose)
-        out.println("Number of region files found -> "
-            + regionFiles.size());
+      if (verbose) {
+        out.println("Number of region files found -> " + regionFiles.size());
+      }
       if (verbose) {
         int i = 1;
         for (Path p : regionFiles) {
-          if (verbose)
+          if (verbose) {
             out.println("Found file[" + i++ + "] -> " + p);
+          }
         }
       }
       files.addAll(regionFiles);
@@ -321,16 +348,12 @@
       // scan over file and read key/value's and check if requested
       HFileScanner scanner = reader.getScanner(false, false, false);
       fileStats = new KeyValueStatsCollector();
-      boolean shouldScanKeysValues = false;
-      if (this.isSeekToRow) {
-        // seek to the first kv on this row
-        shouldScanKeysValues =
-            (scanner.seekTo(PrivateCellUtil.createFirstOnRow(this.row)) != -1);
-      } else {
-        shouldScanKeysValues = scanner.seekTo();
-      }
-      if (shouldScanKeysValues)
+      // If a row was asked for, seek to the first kv on that row.
+      boolean shouldScanKeysValues = this.isSeekToRow ?
+          scanner.seekTo(PrivateCellUtil.createFirstOnRow(this.row)) != -1 : scanner.seekTo();
+      if (shouldScanKeysValues) {
         scanKeysValues(file, fileStats, scanner, row);
+      }
     }
 
     // print meta data
@@ -562,14 +585,15 @@
     try {
       out.println("Mid-key: " + reader.midKey().map(CellUtil::getCellKeyAsString));
     } catch (Exception e) {
-      out.println ("Unable to retrieve the midkey");
+      out.println("Unable to retrieve the midkey");
     }
 
     // Printing general bloom information
     DataInput bloomMeta = reader.getGeneralBloomFilterMetadata();
     BloomFilter bloomFilter = null;
-    if (bloomMeta != null)
+    if (bloomMeta != null) {
       bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
+    }
 
     out.println("Bloom filter:");
     if (bloomFilter != null) {
@@ -582,8 +606,9 @@
     // Printing delete bloom information
     bloomMeta = reader.getDeleteBloomFilterMetadata();
     bloomFilter = null;
-    if (bloomMeta != null)
+    if (bloomMeta != null) {
       bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
+    }
 
     out.println("Delete Family Bloom filter:");
     if (bloomFilter != null) {
@@ -593,6 +618,41 @@
     } else {
       out.println(FOUR_SPACES + "Not present");
     }
+    DoublesSketch keySizeSketch =
+        getDoublesSketchFromMetaBlock(reader, HFileWriterImpl.KEYSIZE_SKETCH_KEY_STR);
+    printDoublesSketch(keySizeSketch, "keySize");
+    DoublesSketch valueSizeSketch =
+        getDoublesSketchFromMetaBlock(reader, HFileWriterImpl.VALUESIZE_SKETCH_KEY_STR);
+    printDoublesSketch(valueSizeSketch, "valueSize");
+  }
+
+  private void printDoublesSketch(DoublesSketch sketch, String name) {
+    if (sketch == null) {
+      out.println(name + ": no sketch present");
+      return;
+    }
+    double[] quantiles = sketch.getQuantiles(NORMALIZED_RANKS);
+    out.println(name + " count=" + sketch.getN() +
+        ", min=" + sketch.getMinValue() +
+        ", max=" + sketch.getMaxValue() +
+        ", 50thPercentile=" + quantiles[50] +
+        ", 75thPercentile=" + quantiles[75] +
+        ", 95thPercentile=" + quantiles[95] +
+        ", quantiles(100)=" + Arrays.toString(quantiles));
+  }
+
+  private DoublesSketch getDoublesSketchFromMetaBlock(HFile.Reader reader, String key)
+      throws IOException {
+    HFileBlock hfb = reader.getMetaBlock(key, false);
+    if (hfb == null) {
+      // Files written before sketches were added have no such meta block.
+      return null;
+    }
+    // We wrote the sketch wrapped in an ImmutableBytesWritable, so read it back with one. This
+    // copies the bytes; passing a ByteBuffer to DataSketches Memory.wrap did not work for me.
+    ImmutableBytesWritable ibw = new ImmutableBytesWritable();
+    ibw.readFields(hfb.getByteStream());
+    return DoublesSketch.wrap(Memory.wrap(ibw.get()));
   }
 
   private static class KeyValueStatsCollector {
@@ -652,16 +704,14 @@
 
     @Override
     public String toString() {
-      if (prevCell == null)
+      if (prevCell == null) {
         return "no data available for statistics";
+      }
 
       // Dump the metrics to the output stream
       simpleReporter.stop();
       simpleReporter.report();
-
-      return
-          metricsOutput.toString() +
-          "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
+      return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow);
     }
   }
 
@@ -669,7 +719,7 @@
    * Almost identical to ConsoleReporter, but extending ScheduledReporter,
    * as extending ConsoleReporter in this version of dropwizard is now too much trouble.
   */
-  private static class SimpleReporter extends ScheduledReporter {
+  private static final class SimpleReporter extends ScheduledReporter {
     /**
      * Returns a new {@link Builder} for {@link ConsoleReporter}.
      *
@@ -685,7 +735,7 @@
      * time zone, writing to {@code System.out}, converting rates to events/second, converting
      * durations to milliseconds, and not filtering metrics.
      */
-    public static class Builder {
+    public static final class Builder {
       private final MetricRegistry registry;
       private PrintStream output;
       private Locale locale;
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java
index dd05963c26..00873bd9a3 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFileWriterImpl.java
@@ -25,6 +25,8 @@ import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
+import org.apache.datasketches.quantiles.DoublesSketch;
+import org.apache.datasketches.quantiles.UpdateDoublesSketch;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -38,6 +40,7 @@ import org.apache.hadoop.hbase.CellUtil;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.KeyValueUtil;
 import org.apache.hadoop.hbase.PrivateCellUtil;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
 import org.apache.hadoop.hbase.io.compress.Compression;
 import org.apache.hadoop.hbase.io.crypto.Encryption;
 import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
@@ -53,7 +56,6 @@ import org.apache.hadoop.io.Writable;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-
 import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 
 /**
@@ -84,15 +86,6 @@ public class HFileWriterImpl implements HFile.Writer {
   /** A "file info" block: a key-value map of file-wide metadata. */
   protected HFileInfo fileInfo = new HFileInfo();
 
-  /** Total # of key/value entries, i.e. how many times add() was called. */
-  protected long entryCount = 0;
-
-  /** Used for calculating the average key length. */
-  protected long totalKeyLength = 0;
-
-  /** Used for calculating the average value length. */
-  protected long totalValueLength = 0;
-
   /** Total uncompressed bytes, maybe calculate a compression ratio later. */
   protected long totalUncompressedBytes = 0;
 
@@ -163,6 +156,18 @@
 
   protected long maxMemstoreTS = 0;
 
+  /**
+   * Sketch to keep key sizes.
+   */
+  private final UpdateDoublesSketch keySizeSketch = DoublesSketch.builder().build();
+  public static final String KEYSIZE_SKETCH_KEY_STR = "keySizeSketch";
+
+  /**
+   * Sketch to keep value sizes.
+   */
+  private final UpdateDoublesSketch valueSizeSketch = DoublesSketch.builder().build();
+  public static final String VALUESIZE_SKETCH_KEY_STR = "valueSizeSketch";
+
   public HFileWriterImpl(final Configuration conf, CacheConfig cacheConf, Path path,
       FSDataOutputStream outputStream, HFileContext fileContext) {
     this.outputStream = outputStream;
@@ -607,7 +612,11 @@
     finishBlock();
     writeInlineBlocks(true);
 
-    FixedFileTrailer trailer = new FixedFileTrailer(getMajorVersion(), getMinorVersion());
+    // Serialize out accumulated sketches.
+    appendMetaBlock(KEYSIZE_SKETCH_KEY_STR,
+        new ImmutableBytesWritable(this.keySizeSketch.toByteArray()));
+    appendMetaBlock(VALUESIZE_SKETCH_KEY_STR,
+        new ImmutableBytesWritable(this.valueSizeSketch.toByteArray()));
 
     // Write out the metadata blocks if any.
     if (!metaNames.isEmpty()) {
@@ -637,6 +646,7 @@
     // index.
     long rootIndexOffset = dataBlockIndexWriter.writeIndexBlocks(outputStream);
 
+    FixedFileTrailer trailer = new FixedFileTrailer(getMajorVersion(), getMinorVersion());
     trailer.setLoadOnOpenOffset(rootIndexOffset);
 
     // Meta block index.
@@ -745,8 +755,10 @@
 
     blockWriter.write(cell);
 
-    totalKeyLength += PrivateCellUtil.estimatedSerializedSizeOfKey(cell);
-    totalValueLength += cell.getValueLength();
+    int keySize = PrivateCellUtil.estimatedSerializedSizeOfKey(cell);
+    this.keySizeSketch.update(keySize);
+    int valueSize = cell.getValueLength();
+    this.valueSizeSketch.update(valueSize);
 
     // Are we the first key in this block?
     if (firstCellInBlock == null) {
@@ -757,7 +769,6 @@
 
     // TODO: What if cell is 10MB and we write infrequently? We hold on to cell here indefinitely?
     lastCell = cell;
-    entryCount++;
     this.maxMemstoreTS = Math.max(this.maxMemstoreTS, cell.getSequenceId());
     int tagsLength = cell.getTagsLength();
     if (tagsLength > this.maxTagsLength) {
@@ -794,16 +805,15 @@
     }
 
     // Average key length.
-    int avgKeyLen =
-        entryCount == 0 ? 0 : (int) (totalKeyLength / entryCount);
+    // (The sketch median stands in for the arithmetic mean here.)
+    int avgKeyLen = (int) this.keySizeSketch.getQuantile(0.5);
     fileInfo.append(HFileInfo.AVG_KEY_LEN, Bytes.toBytes(avgKeyLen), false);
+    // Average value length, again via the sketch median.
+    int avgValueLen = (int) this.valueSizeSketch.getQuantile(0.5);
+    fileInfo.append(HFileInfo.AVG_VALUE_LEN, Bytes.toBytes(avgValueLen), false);
+
     fileInfo.append(HFileInfo.CREATE_TIME_TS, Bytes.toBytes(hFileContext.getFileCreateTime()), false);
-    // Average value length.
-    int avgValueLen =
-        entryCount == 0 ?
-            0 : (int) (totalValueLength / entryCount);
-    fileInfo.append(HFileInfo.AVG_VALUE_LEN, Bytes.toBytes(avgValueLen), false);
 
     if (hFileContext.isIncludesTags()) {
       // When tags are not being written in this file, MAX_TAGS_LEN is excluded
       // from the FileInfo
@@ -836,7 +846,7 @@
     // Now we can finish the close
     trailer.setMetaIndexCount(metaNames.size());
     trailer.setTotalUncompressedBytes(totalUncompressedBytes+ trailer.getTrailerSize());
-    trailer.setEntryCount(entryCount);
+    trailer.setEntryCount(this.keySizeSketch.getN());
     trailer.setCompressionCodec(hFileContext.getCompression());
 
     long startTime = System.currentTimeMillis();
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java
index 8fab5a3df8..69f27c056f 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java
@@ -87,7 +87,8 @@ public class TestHFilePrettyPrinter {
     new HFilePrettyPrinter(conf).run(new String[]{"-v", String.valueOf(fileNotInRootDir)});
     String result = new String(stream.toByteArray());
     String expectedResult = "Scanning -> " + fileNotInRootDir + "\n" + "Scanned kv count -> 1000\n";
-    assertEquals(expectedResult, result);
+    LOG.info(result);
+    assertEquals(result, expectedResult, result);
   }
 
   @Test
@@ -108,4 +109,17 @@
     String expectedResult = "Scanning -> " + fileInRootDir + "\n" + "Scanned kv count -> 1000\n";
     assertEquals(expectedResult, result);
   }
+
+  @Test
+  public void testHFilePrettyPrinterMetaData() throws Exception {
+    Path f = UTIL.getDataTestDir("metadata");
+    TestHRegionServerBulkLoad.createHFile(fs, f, cf, fam, value, 10);
+    assertNotEquals("directory used is not an HBase root dir",
+      UTIL.getDefaultRootDirPath(), f);
+
+    System.setOut(ps);
+    new HFilePrettyPrinter(conf).run(new String[]{"-m", String.valueOf(f)});
+    String result = new String(stream.toByteArray());
+    LOG.info(result);
+  }
 }
diff --git a/hbase-shaded/hbase-shaded-mapreduce/pom.xml b/hbase-shaded/hbase-shaded-mapreduce/pom.xml
index 7ccabde90f..770857f0c3 100644
--- a/hbase-shaded/hbase-shaded-mapreduce/pom.xml
+++ b/hbase-shaded/hbase-shaded-mapreduce/pom.xml
@@ -164,6 +164,10 @@
         <exclusion>
          <groupId>javax.servlet.jsp</groupId>
          <artifactId>javax.servlet.jsp-api</artifactId>
        </exclusion>
+        <exclusion>
+          <groupId>org.apache.datasketches</groupId>
+          <artifactId>datasketches-java</artifactId>
+        </exclusion>
diff --git a/pom.xml b/pom.xml
index 1522f2eb91..47c2858ec8 100755
--- a/pom.xml
+++ b/pom.xml
@@ -2255,6 +2255,11 @@
       </dependency>
+      <dependency>
+        <groupId>org.apache.datasketches</groupId>
+        <artifactId>datasketches-java</artifactId>
+        <version>1.3.0-incubating</version>
+      </dependency>
       <dependency>
@@ -3378,6 +3383,11 @@
         <artifactId>hadoop-hdfs-client</artifactId>
         <version>${hadoop-three.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.datasketches</groupId>
+        <artifactId>datasketches-java</artifactId>
+        <version>1.3.0-incubating</version>
+      </dependency>
-- 
2.24.0