diff --git a/pom.xml b/pom.xml
index 880f2354e5..96920ff5f6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -194,7 +194,7 @@
     1.7.10
     4.0.4
     3.0.0-SNAPSHOT
-    0.9.1-SNAPSHOT
+    0.9.1
     0.92.0-incubating
     2.2.0
     2.2.0
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index be93d19f60..09737fbb1b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -34,7 +34,6 @@
 import java.util.Map;
 import java.util.NavigableMap;
 import java.util.TreeMap;
-import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.CompletionService;
 import java.util.concurrent.ExecutorCompletionService;
@@ -367,73 +366,6 @@ public static boolean isOriginal(Footer footer) {
     return result;
   }
 
-  /**
-   * Generate list of columnId which is to be read from the file.
-   *
-   * @param readerSchema Schema of the file from where read is to be performed
-   * @param included list of top level column which is to be read
-   * @param conf Hive conf to access conf "hive.io.file.readNestedColumn.paths"
-   * @return boolean array which corresponds to list of columnIds need to be read.
-   */
-  private static boolean[] genIncludedColumns(Configuration conf, TypeDescription readerSchema,
-      List<Integer> included) {
-    boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
-    if (included != null) {
-      /* Include nested column only which are present in conf "hive.io.file.readNestedColumn.paths" */
-      Set<String> nestedColumnPaths = ColumnProjectionUtils.getNestedColumnPaths(conf);
-      if (nestedColumnPaths.size() != 0) {
-        result[0] = true;
-        for (String column : nestedColumnPaths) {
-          String[] columnPath = column.split("\\.");
-          result = setIncludeForNestedColumns(columnPath, 0, readerSchema, result);
-        }
-      } else {
-        /* This is a fail-safe in-case we fail to obtain nested column paths correctly */
-        result = genIncludedColumns(readerSchema, included);
-      }
-    } else {
-      /* Included will be null in select * scenario and hence filling all as true */
-      Arrays.fill(result, true);
-    }
-    return result;
-  }
-
-  /**
-   * Convert ColumnPath to ColumnId and set ColumnId in Include boolean array to true.
-   *
-   * @param columnPath "a.b.c"
-   * @param position index counter of columnPath field.
-   * @param readerSchema schema in which column name is to searched.
-   * @param include boolean array indicate which all columns are needed to be read from file.
-   * @return filled "include" boolean array.
-   */
-  private static boolean[] setIncludeForNestedColumns(String[] columnPath, int position,
-      TypeDescription readerSchema, boolean[] include) {
-    if (position == (columnPath.length) && readerSchema.getChildren() != null) {
-      /* If the column path is "a.b.c". If c is nested structure then set true for all the children columns. */
-      for (int col = readerSchema.getId(); col <= readerSchema.getMaximumId(); ++col) {
-        include[col] = true;
-      }
-    } else if (position == (columnPath.length) && readerSchema.getChildren() == null) {
-      /* If the column path is "a.b.c". If c is a column then set true for column c. */
-      include[readerSchema.getId()] = true;
-    } else {
-      /*
-       * If the column Path is "a.b.c".
-       * Then set true for a, b and c columns in depth first search fashion.
-       * */
-      int fieldId = 0;
-      String columnName = columnPath[position];
-      while (!columnName.equalsIgnoreCase(readerSchema.getFieldNames().get(fieldId))) {
-        fieldId++;
-      }
-      TypeDescription childSchema = readerSchema.getChildren().get(fieldId);
-      include = setIncludeForNestedColumns(columnPath, ++position, childSchema, include);
-      include[childSchema.getId()] = true;
-    }
-    return include;
-  }
-
   /**
    * Reverses genIncludedColumns; produces the table columns indexes from ORC included columns.
    * @param readerSchema The ORC reader schema for the table.
@@ -480,9 +412,9 @@ public static boolean isOriginal(Footer footer) {
    */
   static boolean[] genIncludedColumns(TypeDescription readerSchema, Configuration conf) {
-    if (!ColumnProjectionUtils.isReadAllColumns(conf)) {
+    if (!ColumnProjectionUtils.isReadAllColumns(conf)) {
       List<Integer> included = ColumnProjectionUtils.getReadColumnIDs(conf);
-      return genIncludedColumns(conf, readerSchema, included);
+      return genIncludedColumns(readerSchema, included);
     } else {
       return null;
     }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
index aaefafaa61..cec1d34535 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java
@@ -1706,80 +1706,6 @@ public void testProjectedColumnSize() throws Exception {
     assertEquals(376804, result.getProjectedColumnsUncompressedSize());
   }
-
-  @Test
-  public void testProjectedColumnSizeWithNestedConf() throws Exception {
-    long[] stripeSizes =
-        new long[]{200, 200, 200, 200, 100};
-    MockFileSystem fs = new MockFileSystem(conf,
-        new MockFile("mock:/a/file", 500,
-            createMockOrcFile(stripeSizes),
-            new MockBlock("host1-1", "host1-2", "host1-3"),
-            new MockBlock("host2-1", "host0", "host2-3"),
-            new MockBlock("host0", "host3-2", "host3-3"),
-            new MockBlock("host4-1", "host4-2", "host4-3"),
-            new MockBlock("host5-1", "host5-2", "host5-3")));
-    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300);
-    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200);
-    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
-    //conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
-    conf.set(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR, "col1");
-    OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
-    OrcInputFormat.SplitGenerator splitter =
-        new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
-            fs.getFileStatus(new Path("/a/file")), null, null, true,
-            new ArrayList(), true, null, null), null, true, true);
-    List<OrcSplit> results = splitter.call();
-    OrcSplit result = results.get(0);
-    assertEquals(3, results.size());
-    assertEquals(3, result.getStart());
-    assertEquals(400, result.getLength());
-    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
-    result = results.get(1);
-    assertEquals(403, result.getStart());
-    assertEquals(400, result.getLength());
-    assertEquals(167468, result.getProjectedColumnsUncompressedSize());
-    result = results.get(2);
-    assertEquals(803, result.getStart());
-    assertEquals(100, result.getLength());
-    assertEquals(41867, result.getProjectedColumnsUncompressedSize());
-
-    // test min = 0, max = 0 generates each stripe
-    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0);
-    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
-    context = new OrcInputFormat.Context(conf);
-    splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
-        fs.getFileStatus(new Path("/a/file")), null, null, true,
-        new ArrayList(),
-        true, null, null), null, true, true);
-    results = splitter.call();
-    assertEquals(5, results.size());
-    for (int i = 0; i < stripeSizes.length; ++i) {
-      assertEquals("checking stripe " + i + " size",
-          stripeSizes[i], results.get(i).getLength());
-      if (i == stripeSizes.length - 1) {
-        assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
-      } else {
-        assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());
-      }
-    }
-
-    // single split
-    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 1000);
-    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 100000);
-    context = new OrcInputFormat.Context(conf);
-    splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
-        fs.getFileStatus(new Path("/a/file")), null, null, true,
-        new ArrayList(),
-        true, null, null), null, true, true);
-    results = splitter.call();
-    assertEquals(1, results.size());
-    result = results.get(0);
-    assertEquals(3, result.getStart());
-    assertEquals(900, result.getLength());
-    assertEquals(376804, result.getProjectedColumnsUncompressedSize());
-  }
-
   @Test
   public void testInOutFormat() throws Exception {
     Properties properties = new Properties();
@@ -4024,115 +3950,4 @@ public void testColumnProjectionWithAcid() throws Exception {
     assertEquals(1000, record);
     reader.close();
   }
-
-  @Test
-  public void testColumnProjectionWithAcidWithNestedConf() throws Exception {
-    Path baseDir = new Path(workDir, "base_00100");
-    testFilePath = new Path(baseDir, "bucket_00000");
-    fs.mkdirs(baseDir);
-    fs.delete(testFilePath, true);
-    TypeDescription fileSchema =
-        TypeDescription.fromString("struct<operation:int,originalTransaction:bigint,bucket:int,rowId:bigint,currentTransaction:bigint,row:struct<a:int,b:struct<c:int>,d:string>>");
-    Writer writer = OrcFile.createWriter(testFilePath,
-        OrcFile.writerOptions(conf)
-            .fileSystem(fs)
-            .setSchema(fileSchema)
-            .compress(org.apache.orc.CompressionKind.NONE));
-    VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
-    batch.size = 1000;
-    StructColumnVector scv = (StructColumnVector)batch.cols[5];
-    // operation
-    batch.cols[0].isRepeating = true;
-    ((LongColumnVector) batch.cols[0]).vector[0] = 0;
-    // original transaction
-    batch.cols[1].isRepeating = true;
-    ((LongColumnVector) batch.cols[1]).vector[0] = 1;
-    // bucket
-    batch.cols[2].isRepeating = true;
-    ((LongColumnVector) batch.cols[2]).vector[0] = 0;
-    // current transaction
-    batch.cols[4].isRepeating = true;
-    ((LongColumnVector) batch.cols[4]).vector[0] = 1;
-
-    LongColumnVector lcv = (LongColumnVector)
-        ((StructColumnVector) scv.fields[1]).fields[0];
-    for(int r=0; r < 1000; r++) {
-      // row id
-      ((LongColumnVector) batch.cols[3]).vector[r] = r;
-      // a
-      ((LongColumnVector) scv.fields[0]).vector[r] = r * 42;
-      // b.c
-      lcv.vector[r] = r * 10001;
-      // d
-      ((BytesColumnVector) scv.fields[2]).setVal(r,
-          Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
-    }
-    writer.addRowBatch(batch);
-    writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME,
-        ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8)));
-    writer.close();
-    long fileLength = fs.getFileStatus(testFilePath).getLen();
-
-    // test with same schema with include
-    conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:");
-    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d");
-    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string");
-    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
-    conf.set(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR, "a,d");
-//    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2");
-    OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength,
-        new String[0], null, false, true,
-        new ArrayList(), fileLength, fileLength, workDir);
-    OrcInputFormat inputFormat = new OrcInputFormat();
-    AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split,
-        new AcidInputFormat.Options(conf));
-    int record = 0;
-    RecordIdentifier id = reader.createKey();
-    OrcStruct struct = reader.createValue();
-    while (reader.next(id, struct)) {
-      assertEquals("id " + record, record, id.getRowId());
-      assertEquals("bucket " + record, 0, id.getBucketProperty());
-      assertEquals("trans " + record, 1, id.getTransactionId());
-      assertEquals("a " + record,
-          42 * record, ((IntWritable) struct.getFieldValue(0)).get());
-      assertEquals(null, struct.getFieldValue(1));
-      assertEquals("d " + record,
-          Integer.toHexString(record), struct.getFieldValue(2).toString());
-      record += 1;
-    }
-    assertEquals(1000, record);
-    reader.close();
-
-    // test with schema evolution and include
-    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f");
-    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string,int");
-    conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false");
-//    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3");
-    conf.set(ColumnProjectionUtils.READ_NESTED_COLUMN_PATH_CONF_STR, "a,d,f");
-    split = new OrcSplit(testFilePath, null, 0, fileLength,
-        new String[0], null, false, true,
-        new ArrayList(), fileLength, fileLength, workDir);
-    inputFormat = new OrcInputFormat();
-    reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf));
-    record = 0;
-    id = reader.createKey();
-    struct = reader.createValue();
-    while (reader.next(id, struct)) {
-      assertEquals("id " + record, record, id.getRowId());
-      assertEquals("bucket " + record, 0, id.getBucketProperty());
-      assertEquals("trans " + record, 1, id.getTransactionId());
-      assertEquals("a " + record,
-          42 * record, ((IntWritable) struct.getFieldValue(0)).get());
-      assertEquals(null, struct.getFieldValue(1));
-      assertEquals("d " + record,
-          Integer.toHexString(record), struct.getFieldValue(2).toString());
-      assertEquals("f " + record, null, struct.getFieldValue(3));
-      record += 1;
-    }
-    assertEquals(1000, record);
-    reader.close();
-  }
 }
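For context on what the reverted code did: the removed setIncludeForNestedColumns walks a dotted nested-column path (for example "b.c") down the ORC TypeDescription tree and flips on the column ids it visits, so only the referenced leaves and their ancestors are read from the file. The sketch below is a minimal standalone illustration of that idea against org.apache.orc.TypeDescription; the class name, method names, and example schema are made up for the illustration and are not Hive APIs, and it collapses the removed code's two end-of-path branches (leaf vs. nested struct) into a single subtree loop.

```java
import java.util.Arrays;
import java.util.List;

import org.apache.orc.TypeDescription;

/**
 * Illustrative sketch of nested-column include generation (not Hive code).
 * Given a reader schema and dotted paths such as "a" or "b.c", mark the ORC
 * column ids that must be read.
 */
public class NestedColumnIncludeSketch {

  /** Marks the column ids reachable from the given dotted paths. */
  static boolean[] genIncluded(TypeDescription readerSchema, List<String> nestedPaths) {
    boolean[] include = new boolean[readerSchema.getMaximumId() + 1];
    include[0] = true;                            // root struct is always read
    for (String path : nestedPaths) {
      markPath(path.split("\\."), 0, readerSchema, include);
    }
    return include;
  }

  /** Depth-first walk: mark each node along the path, then the whole subtree at the end. */
  private static void markPath(String[] path, int pos, TypeDescription schema, boolean[] include) {
    if (pos == path.length) {
      // End of the path: include this column and, if it is nested, all of its children.
      for (int col = schema.getId(); col <= schema.getMaximumId(); ++col) {
        include[col] = true;
      }
      return;
    }
    List<String> fieldNames = schema.getFieldNames();      // valid for struct types
    List<TypeDescription> children = schema.getChildren();
    for (int f = 0; f < fieldNames.size(); ++f) {
      if (fieldNames.get(f).equalsIgnoreCase(path[pos])) {
        TypeDescription child = children.get(f);
        include[child.getId()] = true;                      // mark the intermediate node itself
        markPath(path, pos + 1, child, include);
        return;
      }
    }
    throw new IllegalArgumentException("No field '" + path[pos] + "' in " + schema);
  }

  public static void main(String[] args) {
    // Hypothetical schema, not taken from the patch.
    TypeDescription schema =
        TypeDescription.fromString("struct<a:int,b:struct<c:int,d:string>,e:string>");
    // Reading only "a" and "b.c" leaves b.d and e unmarked.
    System.out.println(Arrays.toString(genIncluded(schema, Arrays.asList("a", "b.c"))));
  }
}
```

With the example schema this prints [true, true, true, true, false, false]: column ids for the root, a, b, and b.c are read, while b.d and e stay excluded.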