Index: src/main/resources/hbase-default.xml =================================================================== --- src/main/resources/hbase-default.xml (revision 996633) +++ src/main/resources/hbase-default.xml (working copy) @@ -404,11 +404,16 @@ - hfile.min.blocksize.size + hbase.mapreduce.hfileoutputformat.blocksize 65536 - Minimum store file block size. The smaller you make this, the - bigger your index and the less you fetch on a random-access. Set size down - if you have small cells and want faster random-access of individual cells. + The mapreduce HFileOutputFormat writes storefiles/hfiles. + This is the minimum hfile blocksize to emit. Usually in hbase, writing + hfiles, the blocksize is taken from the table schema (HColumnDescriptor) + but in the mapreduce outputformat context, we don't have access to the + schema so get blocksize from Configuration. The smaller you make + the blocksize, the bigger your index and the less you fetch on a + random-access. Set the blocksize down if you have small cells and want + faster random-access of individual cells. Index: src/main/java/org/apache/hadoop/hbase/HColumnDescriptor.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/HColumnDescriptor.java (revision 996633) +++ src/main/java/org/apache/hadoop/hbase/HColumnDescriptor.java (working copy) @@ -73,7 +73,14 @@ public static final String COMPRESSION = "COMPRESSION"; public static final String BLOCKCACHE = "BLOCKCACHE"; + + /** + * Size of storefile/hfile 'blocks'. Default is {@link #DEFAULT_BLOCKSIZE}. + * Use smaller block sizes for faster random-access at expense of larger + * indices (more memory consumption). 
+ */ public static final String BLOCKSIZE = "BLOCKSIZE"; + public static final String LENGTH = "LENGTH"; public static final String TTL = "TTL"; public static final String BLOOMFILTER = "BLOOMFILTER"; @@ -108,8 +115,7 @@ public static final boolean DEFAULT_BLOCKCACHE = true; /** - * Default size of blocks in files store to the filesytem. Use smaller for - * faster random-access at expense of larger indices (more memory consumption). + * Default size of blocks in files stored to the filesystem (hfiles). */ public static final int DEFAULT_BLOCKSIZE = HFile.DEFAULT_BLOCKSIZE; @@ -222,7 +228,9 @@ * @param inMemory If true, column data should be kept in an HRegionServer's * cache * @param blockCacheEnabled If true, MapFile blocks should be cached - * @param blocksize + * @param blocksize Block size to use when writing out storefiles. Use + * smaller blocksizes for faster random-access at expense of larger indices + * (more memory consumption). Default is usually 64k. * @param timeToLive Time-to-live of cell contents, in seconds * (use HConstants.FOREVER for unlimited TTL) * @param bloomFilter Bloom filter type for this column @@ -374,7 +382,7 @@ } /** - * @return Blocksize. + * @return The storefile/hfile blocksize for this column family. */ public synchronized int getBlocksize() { if (this.blocksize == null) { @@ -386,7 +394,8 @@ } /** - * @param s + * @param s Blocksize to use when writing out storefiles/hfiles on this + * column family. */ public void setBlocksize(int s) { setValue(BLOCKSIZE, Integer.toString(s)); Index: src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java =================================================================== --- src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java (revision 996633) +++ src/main/java/org/apache/hadoop/hbase/mapreduce/HFileOutputFormat.java (working copy) @@ -75,7 +75,8 @@ final FileSystem fs = outputdir.getFileSystem(conf); // These configs. 
are from hbase-*.xml final long maxsize = conf.getLong("hbase.hregion.max.filesize", 268435456); - final int blocksize = conf.getInt("hfile.min.blocksize.size", 65536); + final int blocksize = + conf.getInt("hbase.mapreduce.hfileoutputformat.blocksize", 65536); // Invented config. Add to hbase-*.xml if other than default compression. final String compression = conf.get("hfile.compression", Compression.Algorithm.NONE.getName());