diff --git metastore/src/java/org/apache/hadoop/hive/metastore/TransactionalValidationListener.java metastore/src/java/org/apache/hadoop/hive/metastore/TransactionalValidationListener.java
index 3a3d184b65..8c97fc0306 100644
--- metastore/src/java/org/apache/hadoop/hive/metastore/TransactionalValidationListener.java
+++ metastore/src/java/org/apache/hadoop/hive/metastore/TransactionalValidationListener.java
@@ -17,11 +17,18 @@
  */
 package org.apache.hadoop.hive.metastore;
 
+import java.io.IOException;
+import java.nio.file.Files;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
@@ -121,9 +128,10 @@ private void handleAlterTableTransactionalProp(PreAlterTableEvent context) throw
       }
       if (newTable.getTableType().equals(TableType.EXTERNAL_TABLE.toString())) {
-        throw new MetaException(newTable.getDbName() + "." + newTable.getTableName() +
+        throw new MetaException(getTableName(newTable) +
             " cannot be declared transactional because it's an external table");
       }
+      validateTableStructure(newTable);
       hasValidTransactionalValue = true;
     }
@@ -283,4 +291,58 @@ private String validateTransactionalProperties(String transactionalProperties) {
     }
     return null; // All checks passed, return null.
   }
+  private int countDepth(Path p) {
+    int depth = 1;
+    while (!p.isRoot()) {
+      depth++;
+      p = p.getParent();
+    }
+    return depth;
+  }
+  private final Pattern ORIGINAL_PATTERN = Pattern.compile("[0-9]+_[0-9]+");
+  /**
+   * @see org.apache.hadoop.hive.ql.exec.Utilities#COPY_KEYWORD
+   */
+  private static final Pattern ORIGINAL_PATTERN_COPY =
+    Pattern.compile("[0-9]+_[0-9]+" + "_copy_" + "[0-9]+");
+
+  /**
+   * Non-acid to acid bucketed table conversion can only handle tables with a "standard" data
+   * layout. This means that bucket files are named according to {@link #ORIGINAL_PATTERN} or
+   * {@link #ORIGINAL_PATTERN_COPY} and are located at the root of the partition dir (or the
+   * table root if the table is not partitioned).
+   * {@link OrcRawRecordMerger} is where the conversion logic is.
+   */
+  private void validateTableStructure(Table table) throws MetaException {
+    final boolean isBucketed = table.getSd().getBucketColsSize() > 0;
+    Path tablePath = new Path(table.getSd().getLocation());
+    int rootDepth = countDepth(tablePath);
+    int partitionDepth = table.getPartitionKeysSize(); //this is the number of partition cols
+    int expectedFileDepth = rootDepth + partitionDepth + 1;
+    try {
+      FileSystem fs = FileSystem.get(getConf());
+      RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(tablePath, true);
+      while (iterator.hasNext()) {
+        LocatedFileStatus fileStatus = iterator.next();
+        if (!fileStatus.isFile()) {
+          continue;
+        }
+        int fileDepth = countDepth(fileStatus.getPath());
+        boolean validFile = expectedFileDepth == fileDepth &&
+          (ORIGINAL_PATTERN.matcher(fileStatus.getPath().getName()).matches() ||
+            ORIGINAL_PATTERN_COPY.matcher(fileStatus.getPath().getName()).matches());
+        if (!validFile) {
+          throw new IllegalStateException("Unexpected data file name/location. Data files must be " +
+            "at the root of the table/partition. Cannot convert " + getTableName(table) +
+            " to transactional table. File: " + fileStatus.getPath());
+        }
+      }
+    } catch (IOException e) {
+      throw new MetaException("Unable to list files for " + getTableName(table));
+    }
+  }
+  private static String getTableName(Table table) {
+    return table.getDbName() + "." + table.getTableName();
+  }
 }
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
index cbbb4c47ff..62ee587ee9 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java
@@ -24,6 +24,7 @@
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator;
 import org.apache.hadoop.hive.ql.exec.Utilities;
 import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
 import org.apache.hadoop.hive.ql.io.BucketCodec;
@@ -296,9 +297,11 @@ public void next(OrcStruct next) throws IOException {
  * of these files as part of a single logical bucket file.
  *
  * Also, for unbucketed (non acid) tables, there are no guarantees where data files may be placed.
- * For example, CTAS+Tez+Union creates subdirs 1/, 2/, etc for each leg of the Union. Thus the
- * data file need not be an immediate child of partition dir. All files for a given writerId are
- * treated as one logical unit to assign {@link RecordIdentifier}s to them consistently.
+ * For example, CTAS+Tez+Union creates subdirs
+ * {@link AbstractFileMergeOperator#UNION_SUDBIR_PREFIX}_1/,
+ * {@link AbstractFileMergeOperator#UNION_SUDBIR_PREFIX}_2/, etc for each leg of the Union. Thus
+ * the data file need not be an immediate child of the partition dir. All files for a given
+ * writerId are treated as one logical unit to assign {@link RecordIdentifier}s to them consistently.
  *
  * For Compaction, where each split includes the whole bucket, this means reading over all the
  * files in order to assign ROW__ID.rowid in one sequence for the entire logical bucket.