diff --git common/src/java/org/apache/hadoop/hive/conf/HiveConf.java common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index a6ecb373d8..179661d1d5 100644 --- common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1896,9 +1896,7 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal "Sets the operational properties that control the appropriate behavior for various\n" + "versions of the Hive ACID subsystem. Mostly it is intended to be used as an internal property\n" + "for future versions of ACID. (See HIVE-14035 for details.)\n" - + "0: Turn on the legacy mode for ACID\n" + "1: Enable split-update feature found in the newer version of Hive ACID subsystem\n" - + "2: Hash-based merge, which combines delta files using GRACE hash join based approach (not implemented)\n" + "3: Make the table 'quarter-acid' as it only supports insert. But it doesn't require ORC or bucketing.\n" + "This is intended to be used as an internal property for future versions of ACID. (See\n" + "HIVE-14035 for details.)"), diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index a08163823c..e9369daaf8 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -468,6 +468,7 @@ minillaplocal.query.files=\ acid_no_buckets.q, \ acid_globallimit.q,\ acid_vectorization_missing_cols.q,\ + acid_vectorization_original.q,\ alter_merge_stats_orc.q,\ authorization_view_8.q,\ auto_join30.q,\ diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index c364343528..4e86b03a91 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -1869,7 +1869,13 @@ private static void scheduleSplits(ETLSplitStrategy splitStrategy, Context conte reporter.setStatus(inputSplit.toString()); - boolean isFastVectorizedReaderAvailable = + //TODO: why would inputSplit be something other than OrcSplit? If that is really possible we + //have to retain VectorizedOrcAcidRowReader or make VectorizedOrcAcidRowBatchReader handle + //non orc splits + //Nothing in Acid can work with FileSplit - it needs OrcSplit so at least for Acid we get rid + //of (is OrcSplit) checks. 
FileSplit (which is not OrcSplit) can be a result of CombineInputFormat which + //should be disabled somewhere for Acid path + boolean isFastVectorizedReaderAvailable = vectorMode && VectorizedOrcAcidRowBatchReader.canCreateVectorizedAcidRowBatchReaderOnSplit(conf, inputSplit); if (vectorMode && isFastVectorizedReaderAvailable) { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java index 1e19a911a6..461a5d7525 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java @@ -62,21 +62,22 @@ private static final Logger LOG = LoggerFactory.getLogger(OrcRecordUpdater.class); - public static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index"; - public static final String ACID_FORMAT = "_orc_acid_version"; - public static final int ORC_ACID_VERSION = 0; + static final String ACID_KEY_INDEX_NAME = "hive.acid.key.index"; + private static final String ACID_FORMAT = "_orc_acid_version"; + private static final int ORC_ACID_VERSION = 0; final static int INSERT_OPERATION = 0; final static int UPDATE_OPERATION = 1; final static int DELETE_OPERATION = 2; - + //column indexes of corresponding data in storage layer final static int OPERATION = 0; final static int ORIGINAL_TRANSACTION = 1; final static int BUCKET = 2; final static int ROW_ID = 3; final static int CURRENT_TRANSACTION = 4; final static int ROW = 5; + //total number of fields (above) final static int FIELDS = 6; final static int DELTA_BUFFER_SIZE = 16 * 1024; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index 260a5ac3e2..8d227bf3a4 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -52,10 +52,13 @@ private static final Logger LOG = LoggerFactory.getLogger(OrcSplit.class); private OrcTail orcTail; private boolean hasFooter; + //todo: what is this set to if not acid read? private boolean isOriginal; + //todo: is this always true in 2.0? 
should be
   private boolean hasBase;
   //partition root
   private Path rootDir;
+  //todo: rename - these should be delete deltas in 2.0
   private final List deltas = new ArrayList<>();
   private long projColsUncompressedSize;
   private transient Object fileKey;
@@ -228,6 +231,11 @@ public long getColumnarProjectionSize() {
     return projColsUncompressedSize;
   }
+  /**
+   * todo: the logic here needs work
+   * @param conf the job configuration
+   * @return true if LLAP IO can be used for this split
+   */
   @Override
   public boolean canUseLlapIo(Configuration conf) {
     final boolean hasDelta = deltas != null && !deltas.isEmpty();
@@ -238,6 +246,7 @@ public boolean canUseLlapIo(Configuration conf) {
     final AcidUtils.AcidOperationalProperties acidOperationalProperties = AcidUtils.getAcidOperationalProperties(conf);
     final boolean isSplitUpdate = acidOperationalProperties.isSplitUpdate();
+    assert isSplitUpdate : "should be true in Hive 3.0";
     if (isOriginal) {
       if (!isAcidRead && !hasDelta) {
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
index 1e16f09bc7..98b4d3d512 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowBatchReader.java
@@ -37,9 +37,13 @@
 import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
+import org.apache.hadoop.hive.ql.io.AcidInputFormat;
+import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
 import org.apache.hadoop.hive.ql.io.AcidUtils;
 import org.apache.hadoop.hive.ql.io.BucketCodec;
 import org.apache.hadoop.hive.ql.io.RecordIdentifier;
+import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
+import org.apache.hadoop.hive.shims.HadoopShims;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
@@ -57,32 +61,61 @@
  * directly read from the base files/insert_only deltas in vectorized row batches. The deleted
  * rows can then be easily indicated via the 'selected' field of the vectorized row batch.
  * Refer HIVE-14233 for more details.
+ *
+ *
+ * todo: annotate the plan to indicate which reader is used?
+ * better yet, remove VectorizedOrcAcidRowReader - then there is just 1 vectorized path
+ * Also, figure out how to make it not use the LLAP cache for original files with ROW__ID projection
+ * This way everything still vectorizes
+ * todo: add a test that loads a lot of data, like 100k rows, into multiple stripes/row groups
+ * add some checksum queries to make sure ROW__IDs are correct/distinct etc.
+ * Add a few queries with predicates to make sure ROW__IDs are correct, i.e. that we assign them
+ * consistently
+ * Can you just convert 100k table to acid?
maybe not - will screw up other tests + * */ public class VectorizedOrcAcidRowBatchReader implements org.apache.hadoop.mapred.RecordReader { private static final Logger LOG = LoggerFactory.getLogger(VectorizedOrcAcidRowBatchReader.class); - public org.apache.hadoop.mapred.RecordReader baseReader; - protected VectorizedRowBatchCtx rbCtx; - protected VectorizedRowBatch vectorizedRowBatchBase; + private org.apache.hadoop.mapred.RecordReader baseReader; + private final VectorizedRowBatchCtx rbCtx; + private VectorizedRowBatch vectorizedRowBatchBase; private long offset; private long length; protected float progress = 0.0f; protected Object[] partitionValues; - protected boolean addPartitionCols = true; - private ValidTxnList validTxnList; - protected DeleteEventRegistry deleteEventRegistry; - protected StructColumnVector recordIdColumnVector; - private org.apache.orc.Reader.Options readerOptions; + private boolean addPartitionCols = true; + private final ValidTxnList validTxnList; + private final DeleteEventRegistry deleteEventRegistry; + private final StructColumnVector recordIdColumnVector; + private final Reader.Options readerOptions; + private final boolean isOriginal; + /** + * something further in the data pipeline wants {@link VirtualColumn#ROWID} + */ + private final boolean rowIdProjected; + //partition root + private final Path rootPath; + /** + * for reading "original" files + */ + private final OffsetAndBucketProperty syntheticProps; + private RecordReader innerReader; - public VectorizedOrcAcidRowBatchReader(InputSplit inputSplit, JobConf conf, - Reporter reporter) throws IOException { - this.init(inputSplit, conf, reporter, Utilities.getVectorizedRowBatchCtx(conf)); + VectorizedOrcAcidRowBatchReader(InputSplit inputSplit, JobConf conf, + Reporter reporter) throws IOException { + this(inputSplit, conf,reporter, null); + } + @VisibleForTesting + VectorizedOrcAcidRowBatchReader(InputSplit inputSplit, JobConf conf, + Reporter reporter, VectorizedRowBatchCtx rbCtx) throws IOException { + this(conf, inputSplit, reporter, rbCtx == null ? Utilities.getVectorizedRowBatchCtx(conf) : rbCtx); final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, (OrcSplit) inputSplit); // Careful with the range here now, we do not want to read the whole base file like deltas. 
- final RecordReader innerReader = reader.rowsOptions(readerOptions.range(offset, length)); + innerReader = reader.rowsOptions(readerOptions.range(offset, length)); baseReader = new org.apache.hadoop.mapred.RecordReader() { @Override @@ -121,12 +154,13 @@ public float getProgress() throws IOException { public VectorizedOrcAcidRowBatchReader(InputSplit inputSplit, JobConf conf, Reporter reporter, org.apache.hadoop.mapred.RecordReader baseReader, VectorizedRowBatchCtx rbCtx) throws IOException { - this.init(inputSplit, conf, reporter, rbCtx); + this(conf, inputSplit, reporter, rbCtx); this.baseReader = baseReader; + this.innerReader = null; this.vectorizedRowBatchBase = baseReader.createValue(); } - private void init(InputSplit inputSplit, JobConf conf, Reporter reporter, + private VectorizedOrcAcidRowBatchReader(JobConf conf, InputSplit inputSplit, Reporter reporter, VectorizedRowBatchCtx rowBatchCtx) throws IOException { this.rbCtx = rowBatchCtx; final boolean isAcidRead = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN); @@ -143,8 +177,7 @@ private void init(InputSplit inputSplit, JobConf conf, Reporter reporter, final OrcSplit orcSplit = (OrcSplit) inputSplit; reporter.setStatus(orcSplit.toString()); - readerOptions = OrcInputFormat.createOptionsForReader(conf); - readerOptions = OrcRawRecordMerger.createEventOptions(readerOptions); + readerOptions = OrcRawRecordMerger.createEventOptions(OrcInputFormat.createOptionsForReader(conf)); this.offset = orcSplit.getStart(); this.length = orcSplit.getLength(); @@ -167,46 +200,119 @@ private void init(InputSplit inputSplit, JobConf conf, Reporter reporter, deleteEventReaderOptions.range(0, Long.MAX_VALUE); // Disable SARGs for deleteEventReaders, as SARGs have no meaning. deleteEventReaderOptions.searchArgument(null, null); + DeleteEventRegistry der = null; try { // See if we can load all the delete events from all the delete deltas in memory... - this.deleteEventRegistry = new ColumnizedDeleteEventRegistry(conf, orcSplit, deleteEventReaderOptions); + der = new ColumnizedDeleteEventRegistry(conf, orcSplit, deleteEventReaderOptions); } catch (DeleteEventsOverflowMemoryException e) { // If not, then create a set of hanging readers that do sort-merge to find the next smallest // delete event on-demand. Caps the memory consumption to (some_const * no. of readers). 
-      this.deleteEventRegistry = new SortMergedDeleteEventRegistry(conf, orcSplit, deleteEventReaderOptions);
+      der = new SortMergedDeleteEventRegistry(conf, orcSplit, deleteEventReaderOptions);
     }
-
-    recordIdColumnVector = new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, null, null, null);
+    this.deleteEventRegistry = der;
+    isOriginal = orcSplit.isOriginal();
+    if(isOriginal) {
+      recordIdColumnVector = new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
+        new LongColumnVector(), new LongColumnVector(), new LongColumnVector());
+    }
+    else {
+      recordIdColumnVector = new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, null, null, null);
+    }
+    rowIdProjected = areRowIdsProjected(rbCtx);
+    rootPath = orcSplit.getRootDir();
+    syntheticProps = computeOffsetAndBucket(orcSplit, conf, validTxnList);
   }
   /**
+   * Used for generating synthetic ROW__IDs when reading "original" files.
+   */
+  private static final class OffsetAndBucketProperty {
+    private final long rowIdOffset;
+    private final int bucketProperty;
+    private OffsetAndBucketProperty(long rowIdOffset, int bucketProperty) {
+      this.rowIdOffset = rowIdOffset;
+      this.bucketProperty = bucketProperty;
+    }
+  }
+  /**
+   * See {@link #next(NullWritable, VectorizedRowBatch)} first and
+   * {@link OrcRawRecordMerger.OriginalReaderPair}.
+   * Used when reading a split of an "original" file whose data needs to be decorated with ROW__ID.
+   * This requires handling multiple files that are part of the same bucket (tranche for unbucketed
+   * tables) as a single logical file in order to number rowids consistently.
+   *
+   * This logic is executed per split of every "original" file. The computed result is the same
+   * for every split from the same file, so this could be optimized by moving it to before/during
+   * split computation and passing the info in the split. ToDo: file a Jira for this
+   */
+  private OffsetAndBucketProperty computeOffsetAndBucket(
+      OrcSplit split, JobConf conf, ValidTxnList validTxnList) throws IOException {
+    if(!(split.isOriginal() && (rowIdProjected || !deleteEventRegistry.isEmpty()))) {
+      return new OffsetAndBucketProperty(0,0);
+    }
+    long rowIdOffset = 0;
+    int bucketId = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf).getBucketId();
+    int bucketProperty = BucketCodec.V1.encode(new AcidOutputFormat.Options(conf).statementId(0).bucket(bucketId));
+    AcidUtils.Directory directoryState = AcidUtils.getAcidState(split.getRootDir(), conf,
+      validTxnList, false, true);
+    for (HadoopShims.HdfsFileStatusWithId f : directoryState.getOriginalFiles()) {
+      AcidOutputFormat.Options bucketOptions =
+        AcidUtils.parseBaseOrDeltaBucketFilename(f.getFileStatus().getPath(), conf);
+      if (bucketOptions.getBucketId() != bucketId) {
+        continue;
+      }
+      if (f.getFileStatus().getPath().equals(split.getPath())) {
+        //'f' is the file this split comes from
+        break;
+      }
+      Reader reader = OrcFile.createReader(f.getFileStatus().getPath(),
+        OrcFile.readerOptions(conf));
+      rowIdOffset += reader.getNumberOfRows();
+    }
+    return new OffsetAndBucketProperty(rowIdOffset, bucketProperty);
+  }
+  /**
    * Returns whether it is possible to create a valid instance of this class for a given split.
    * @param conf is the job configuration
-   * @param inputSplit
    * @return true if it is possible, else false.
   */
-  public static boolean canCreateVectorizedAcidRowBatchReaderOnSplit(JobConf conf, InputSplit inputSplit) {
+  static boolean canCreateVectorizedAcidRowBatchReaderOnSplit(JobConf conf, InputSplit inputSplit)
+    throws IOException {
     if (!(inputSplit instanceof OrcSplit)) {
       return false; // must be an instance of OrcSplit.
     }
-    // First check if we are reading any original files in the split.
-    // To simplify the vectorization logic, the vectorized acid row batch reader does not handle
-    // original files for now as they have a different schema than a regular ACID file.
     final OrcSplit split = (OrcSplit) inputSplit;
-    if (AcidUtils.getAcidOperationalProperties(conf).isSplitUpdate() && !split.isOriginal()) {
-      // When split-update is turned on for ACID, a more optimized vectorized batch reader
-      // can be created. But still only possible when we are *NOT* reading any originals.
-      return true;
+    if(Utilities.getVectorizedRowBatchCtx(conf) == null) {
+      //the check for whether vectorization is possible is expected to have already happened
+      throw new IllegalStateException("Could not create VectorizedRowBatchCtx for "
+        + split.getRootDir());
     }
-    return false; // no split-update or possibly reading originals!
+    return true;
   }
+  private static boolean areRowIdsProjected(VectorizedRowBatchCtx rbCtx) {
+    if(rbCtx.getVirtualColumnCount() == 0) {
+      return false;
+    }
+    for(VirtualColumn vc : rbCtx.getNeededVirtualColumns()) {
+      if(vc == VirtualColumn.ROWID) {
+        //The query needs ROW__ID: maybe it was asked for explicitly, maybe it's part of an
+        //Update/Delete statement. Either way, we need to decorate "original" rows with row__id.
+        return true;
+      }
+    }
+    return false;
+  }
+  /**
+   * ToDo: refactor/merge with {@link OrcInputFormat#getReader(InputSplit, AcidInputFormat.Options)}
+   */
   private static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException {
     Path path = orcSplit.getPath();
     Path root;
     if (orcSplit.hasBase()) {
       if (orcSplit.isOriginal()) {
-        root = path.getParent();
+        root = orcSplit.getRootDir();
       } else {
         root = path.getParent().getParent();
       }
@@ -216,6 +322,31 @@ public static boolean canCreateVectorizedAcidRowBatchReaderOnSplit(JobConf conf,
     return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas());
   }
+  /**
+   * There are 2 types of schema from the {@link #baseReader} that this handles. In the case where
+   * the data was written to a transactional table from the start, every row is decorated with
+   * transaction related info and looks like <op, otid, writerId, rowid, ctid, <f1, ... fn>>.
+   *
+   * The other case is when data was written to a non-transactional table and thus only has the
+   * user data: <f1, ... fn>. This table was then converted to a transactional table, but the data
+   * files are not changed until major compaction. These are the "original" files.
+   *
+   * In this case we may need to decorate the outgoing data with transactional column values at
+   * read time. (It's done somewhat out of band via VectorizedRowBatchCtx - ask Teddy Choi.)
+   * The "otid, writerId, rowid" columns represent {@link RecordIdentifier}. They are assigned
+   * each time the table is read in a way that needs to project {@link VirtualColumn#ROWID}.
+   * Major compaction will attach these values to each row permanently.
+   * It's critical that these generated column values are assigned exactly the same way by each
+   * read of the same row and by the Compactor.
+   * See {@link org.apache.hadoop.hive.ql.txn.compactor.CompactorMR} and
+   * {@link OrcRawRecordMerger.OriginalReaderPairToCompact} for the Compactor read path.
+ * (Longer term should make compactor use this class) + * + * This only decorates original rows with metadata if something above is requesting these values + * or if there are Delete events to apply. + * + * @return false where there is no more data, i.e. {@code value} is empty + */ @Override public boolean next(NullWritable key, VectorizedRowBatch value) throws IOException { try { @@ -257,12 +388,60 @@ public boolean next(NullWritable key, VectorizedRowBatch value) throws IOExcepti // When selectedInUse is set to false, everything in the batch is selected. selectedBitSet.set(0, vectorizedRowBatchBase.size, true); } - - // Case 1- find rows which belong to transactions that are not valid. - findRecordsWithInvalidTransactionIds(vectorizedRowBatchBase, selectedBitSet); + ColumnVector[] innerRecordIdColumnVector = vectorizedRowBatchBase.cols; + if(isOriginal) { + /* + * If there are deletes and reading original file, we must produce synthetic ROW_IDs in order + * to see if any deletes apply + */ + if(rowIdProjected || !deleteEventRegistry.isEmpty()) { + if(innerReader == null) { + throw new IllegalStateException(getClass().getName() + " requires " + + org.apache.orc.RecordReader.class + + " to handle original files that require ROW__IDs: " + rootPath); + } + /** + * {@link RecordIdentifier#getTransactionId()} + */ + recordIdColumnVector.fields[0].noNulls = true; + recordIdColumnVector.fields[0].isRepeating = true; + //all "original" is considered written by txnid:0 which committed + ((LongColumnVector)recordIdColumnVector.fields[0]).vector[0] = 0; + /** + * This is {@link RecordIdentifier#getBucketProperty()} + * Also see {@link BucketCodec} + */ + recordIdColumnVector.fields[1].noNulls = true; + recordIdColumnVector.fields[1].isRepeating = true; + ((LongColumnVector)recordIdColumnVector.fields[1]).vector[0] = syntheticProps.bucketProperty; + /** + * {@link RecordIdentifier#getRowId()} + */ + recordIdColumnVector.fields[2].noNulls = true; + recordIdColumnVector.fields[2].isRepeating = false; + long[] rowIdVector = ((LongColumnVector)recordIdColumnVector.fields[2]).vector; + for(int i = 0; i < vectorizedRowBatchBase.size; i++) { + //baseReader.getRowNumber() seems to point at the start of the batch todo: validate + rowIdVector[i] = syntheticProps.rowIdOffset + innerReader.getRowNumber() + i; + } + //Now populate a structure to use to apply delete events + innerRecordIdColumnVector = new ColumnVector[OrcRecordUpdater.FIELDS]; + innerRecordIdColumnVector[OrcRecordUpdater.ORIGINAL_TRANSACTION] = recordIdColumnVector.fields[0]; + innerRecordIdColumnVector[OrcRecordUpdater.BUCKET] = recordIdColumnVector.fields[1]; + innerRecordIdColumnVector[OrcRecordUpdater.ROW_ID] = recordIdColumnVector.fields[2]; + } + } + else { + // Case 1- find rows which belong to transactions that are not valid. + findRecordsWithInvalidTransactionIds(vectorizedRowBatchBase, selectedBitSet); + /** + * All "original" data belongs to txnid:0 and is always valid/committed for every reader + * So do not need to filter wrt {@link validTxnList} + */ + } // Case 2- find rows which have been deleted. 
- this.deleteEventRegistry.findDeletedRecords(vectorizedRowBatchBase.cols, + this.deleteEventRegistry.findDeletedRecords(innerRecordIdColumnVector, vectorizedRowBatchBase.size, selectedBitSet); if (selectedBitSet.cardinality() == vectorizedRowBatchBase.size) { @@ -283,30 +462,39 @@ public boolean next(NullWritable key, VectorizedRowBatch value) throws IOExcepti } } - // Finally, link up the columnVector from the base VectorizedRowBatch to outgoing batch. - // NOTE: We only link up the user columns and not the ACID metadata columns because this - // vectorized code path is not being used in cases of update/delete, when the metadata columns - // would be expected to be passed up the operator pipeline. This is because - // currently the update/delete specifically disable vectorized code paths. - // This happens at ql/exec/Utilities.java::3293 when it checks for mapWork.getVectorMode() - StructColumnVector payloadStruct = (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW]; - // Transfer columnVector objects from base batch to outgoing batch. - System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount()); - if (rbCtx != null) { - recordIdColumnVector.fields[0] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]; - recordIdColumnVector.fields[1] = vectorizedRowBatchBase.cols[OrcRecordUpdater.BUCKET]; - recordIdColumnVector.fields[2] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW_ID]; + if(isOriginal) { + /*Just copy the payload. {@link recordIdColumnVector} has already been populated*/ + System.arraycopy(vectorizedRowBatchBase.cols, 0, value.cols, 0, + value.getDataColumnCount()); + } + else { + // Finally, link up the columnVector from the base VectorizedRowBatch to outgoing batch. + // NOTE: We only link up the user columns and not the ACID metadata columns because this + // vectorized code path is not being used in cases of update/delete, when the metadata columns + // would be expected to be passed up the operator pipeline. This is because + // currently the update/delete specifically disable vectorized code paths. + // This happens at ql/exec/Utilities.java::3293 when it checks for mapWork.getVectorMode() + StructColumnVector payloadStruct = (StructColumnVector) vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW]; + // Transfer columnVector objects from base batch to outgoing batch. 
+ System.arraycopy(payloadStruct.fields, 0, value.cols, 0, value.getDataColumnCount()); + if(rowIdProjected) { + recordIdColumnVector.fields[0] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ORIGINAL_TRANSACTION]; + recordIdColumnVector.fields[1] = vectorizedRowBatchBase.cols[OrcRecordUpdater.BUCKET]; + recordIdColumnVector.fields[2] = vectorizedRowBatchBase.cols[OrcRecordUpdater.ROW_ID]; + } + } + if(rowIdProjected) { rbCtx.setRecordIdColumnVector(recordIdColumnVector); } progress = baseReader.getProgress(); return true; } - protected void findRecordsWithInvalidTransactionIds(VectorizedRowBatch batch, BitSet selectedBitSet) { + private void findRecordsWithInvalidTransactionIds(VectorizedRowBatch batch, BitSet selectedBitSet) { findRecordsWithInvalidTransactionIds(batch.cols, batch.size, selectedBitSet); } - protected void findRecordsWithInvalidTransactionIds(ColumnVector[] cols, int size, BitSet selectedBitSet) { + private void findRecordsWithInvalidTransactionIds(ColumnVector[] cols, int size, BitSet selectedBitSet) { if (cols[OrcRecordUpdater.CURRENT_TRANSACTION].isRepeating) { // When we have repeating values, we can unset the whole bitset at once // if the repeating value is not a valid transaction. @@ -387,6 +575,11 @@ DeleteEventRegistry getDeleteEventRegistry() { * @throws IOException */ public void close() throws IOException; + + /** + * @return {@code true} if no delete events were found + */ + boolean isEmpty(); } /** @@ -400,10 +593,10 @@ DeleteEventRegistry getDeleteEventRegistry() { private OrcRawRecordMerger deleteRecords; private OrcRawRecordMerger.ReaderKey deleteRecordKey; private OrcStruct deleteRecordValue; - private boolean isDeleteRecordAvailable = true; + private Boolean isDeleteRecordAvailable = null; private ValidTxnList validTxnList; - public SortMergedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, Reader.Options readerOptions) + SortMergedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, Reader.Options readerOptions) throws IOException { final Path[] deleteDeltas = getDeleteDeltaDirsFromSplit(orcSplit); if (deleteDeltas.length > 0) { @@ -428,6 +621,13 @@ public SortMergedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, Reader.Opt } @Override + public boolean isEmpty() { + if(isDeleteRecordAvailable == null) { + throw new IllegalStateException("Not yet initialized"); + } + return !isDeleteRecordAvailable; + } + @Override public void findDeletedRecords(ColumnVector[] cols, int size, BitSet selectedBitSet) throws IOException { if (!isDeleteRecordAvailable) { @@ -546,7 +746,7 @@ public void close() throws IOException { */ private int bucketProperty; private long rowId; - public DeleteRecordKey() { + DeleteRecordKey() { this.originalTransactionId = -1; this.rowId = -1; } @@ -596,7 +796,7 @@ public String toString() { private boolean isBucketPropertyRepeating; private final boolean isBucketedTable; - public DeleteReaderValue(Reader deleteDeltaReader, Reader.Options readerOptions, int bucket, + DeleteReaderValue(Reader deleteDeltaReader, Reader.Options readerOptions, int bucket, ValidTxnList validTxnList, boolean isBucketedTable) throws IOException { this.recordReader = deleteDeltaReader.rowsOptions(readerOptions); this.bucketForSplit = bucket; @@ -741,8 +941,9 @@ public int compareTo(CompressedOtid other) { private long rowIds[]; private CompressedOtid compressedOtids[]; private ValidTxnList validTxnList; + private Boolean isEmpty = null; - public ColumnizedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, + ColumnizedDeleteEventRegistry(JobConf conf, 
OrcSplit orcSplit, Reader.Options readerOptions) throws IOException, DeleteEventsOverflowMemoryException { int bucket = AcidUtils.parseBaseOrDeltaBucketFilename(orcSplit.getPath(), conf).getBucketId(); String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY); @@ -804,6 +1005,7 @@ public ColumnizedDeleteEventRegistry(JobConf conf, OrcSplit orcSplit, readAllDeleteEventsFromDeleteDeltas(); } } + isEmpty = compressedOtids == null || rowIds == null; } catch(IOException|DeleteEventsOverflowMemoryException e) { close(); // close any open readers, if there was some exception during initialization. throw e; // rethrow the exception so that the caller can handle. @@ -910,7 +1112,13 @@ private boolean isDeleted(long otid, int bucketProperty, long rowId) { } return false; } - + @Override + public boolean isEmpty() { + if(isEmpty == null) { + throw new IllegalStateException("Not yet initialized"); + } + return isEmpty; + } @Override public void findDeletedRecords(ColumnVector[] cols, int size, BitSet selectedBitSet) throws IOException { diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java index 885ef83381..90403e1b8a 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcAcidRowReader.java @@ -40,7 +40,12 @@ * support tables and partitions stored in the ACID format. It works by using * the non-vectorized ACID reader and moving the data into a vectorized row * batch. + * + * Is there a reason to still have this when we have VectorizedOrcAcidRowBatchReader? + * Once VectorizedOrcAcidRowBatchReader handles isOriginal, there is really no reason + * to have this, right? */ +@Deprecated public class VectorizedOrcAcidRowReader implements org.apache.hadoop.mapred.RecordReader { diff --git ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java index 39d6b2b414..ced0325e95 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java +++ ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands.java @@ -17,7 +17,6 @@ */ package org.apache.hadoop.hive.ql; -import org.apache.commons.io.FileUtils; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.GetOpenTxnsInfoResponse; @@ -37,7 +36,6 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.apache.hadoop.hive.ql.txn.AcidHouseKeeperService; -import org.junit.After; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; @@ -56,9 +54,9 @@ * test AC=true, and AC=false with commit/rollback/exception and test resulting data. * * Can also test, calling commit in AC=true mode, etc, toggling AC... 
- * - * Tests here are for multi-statement transactions (WIP) and those that don't need to - * run with Acid 2.0 (see subclasses of TestTxnCommands2) + * + * Tests here are for multi-statement transactions (WIP) and others + * Mostly uses bucketed tables */ public class TestTxnCommands extends TxnCommandsBaseForTests { static final private Logger LOG = LoggerFactory.getLogger(TestTxnCommands.class); diff --git ql/src/test/org/apache/hadoop/hive/ql/TestTxnNoBuckets.java ql/src/test/org/apache/hadoop/hive/ql/TestTxnNoBuckets.java index c827dc4a0e..e310b9d025 100644 --- ql/src/test/org/apache/hadoop/hive/ql/TestTxnNoBuckets.java +++ ql/src/test/org/apache/hadoop/hive/ql/TestTxnNoBuckets.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; import org.junit.Assert; import org.junit.Before; +import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestName; @@ -522,5 +523,88 @@ public void testCtasBucketed() throws Exception { // Assert.assertEquals("Wrong msg", ErrorMsg.CTAS_PARCOL_COEXISTENCE.getErrorCode(), cpr.getErrorCode()); Assert.assertTrue(cpr.getErrorMessage().contains("CREATE-TABLE-AS-SELECT does not support")); } + @Test + public void testVectorizedWithDelete() throws Exception { + hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); + hiveConf.setVar(HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "none"); + //this enables vectorization of ROW__ID + hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ROW_IDENTIFIER_ENABLED, true);//HIVE-12631 + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("create table T(a int, b int) stored as orc tblproperties('transactional'='true')"); + runStatementOnDriver("insert into T(a,b) values(1,2),(3,4)"); + runStatementOnDriver("delete from T where b = 4"); + List rs = runStatementOnDriver("select a, b from T"); + Assert.assertEquals(1, rs.size()); + } + /** + * maybe there is no issue surfacing this if we only have 1 vectroized reader for acid... + * need to handle case with delete events for this*/ + @Test + public void testNonAcidToAcidVectorzied() throws Exception { + hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true); + hiveConf.setVar(HiveConf.ConfVars.HIVEFETCHTASKCONVERSION, "none"); + //this enables vectorization of ROW__ID + hiveConf.setBoolVar(HiveConf.ConfVars.HIVE_VECTORIZATION_ROW_IDENTIFIER_ENABLED, true);//HIVE-12631 + runStatementOnDriver("drop table if exists T"); + runStatementOnDriver("create table T(a int, b int) stored as orc"); + int[][] values = {{1,2},{2,4},{5,6},{6,8},{9,10}}; + runStatementOnDriver("insert into T(a, b) " + makeValuesClause(values)); + //, 'transactional_properties'='default' + runStatementOnDriver("alter table T SET TBLPROPERTIES ('transactional'='true')"); + //this uses VectorizedOrcAcidRowBatchReader + List rs = runStatementOnDriver("select a from T where b > 6 order by a"); + String[][] expected = { + {"6", ""}, + {"9", ""}, + }; + checkExpected(rs, expected, "After conversion"); + Assert.assertEquals(Integer.toString(6), rs.get(0)); + Assert.assertEquals(Integer.toString(9), rs.get(1)); + + //why isn't PPD working.... 
- it is working but storage layer doesn't do row level filtering; only row group level + //this uses VectorizedOrcAcidRowBatchReader + rs = runStatementOnDriver("select ROW__ID, a from T where b > 6 order by a"); + String[][] expected1 = { + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":3}", "6"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":4}", "9"} + }; + checkExpected(rs, expected1, "After conversion with VC1"); + + //this uses VectorizedOrcAcidRowBatchReader + rs = runStatementOnDriver("select ROW__ID, a from T where b > 0 order by a"); + String[][] expected2 = { + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":0}", "1"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":1}", "2"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":2}", "5"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":3}", "6"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":4}", "9"} + }; + checkExpected(rs, expected2, "After conversion with VC2"); + + //doesn't vectorize (uses neither of the Vectorzied Acid readers) + rs = runStatementOnDriver("select ROW__ID, a, INPUT__FILE__NAME from T where b > 6 order by a"); + Assert.assertEquals("", 2, rs.size()); + String[][] expected3 = { + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":3}\t6", "warehouse/t/000000_0"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":4}\t9", "warehouse/t/000000_0"} + }; + checkExpected(rs, expected3, "After non-vectorized read"); + Assert.assertEquals(0, BucketCodec.determineVersion(536870912).decodeWriterId(536870912)); + + runStatementOnDriver("update T set b = 17 where a = 1"); + //this should use VectorizedOrcAcidRowReader + rs = runStatementOnDriver("select ROW__ID, b from T where b > 0 order by a"); + String[][] expected4 = { + {"{\"transactionid\":21,\"bucketid\":536870912,\"rowid\":0}","17"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":1}","4"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":2}","6"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":3}","8"}, + {"{\"transactionid\":0,\"bucketid\":536870912,\"rowid\":4}","10"} + }; + checkExpected(rs, expected4, "After conversion with VC4"); + + //this should not vectorize at all + rs = runStatementOnDriver("select ROW__ID, INPUT__FILE__NAME, b from T where b > 0 order by a"); + } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedOrcAcidRowBatchReader.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedOrcAcidRowBatchReader.java index 43e0a4a431..1c42640c73 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedOrcAcidRowBatchReader.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedOrcAcidRowBatchReader.java @@ -31,6 +31,7 @@ import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.AcidOutputFormat; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.BucketCodec; @@ -216,7 +217,7 @@ private void testVectorizedOrcAcidRowBatchReader(String deleteEventRegistry) thr // are being handled properly. 
conf.set(ValidTxnList.VALID_TXNS_KEY, "14:1:1:5"); // Exclude transaction 5 - VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL); + VectorizedOrcAcidRowBatchReader vectorizedReader = new VectorizedOrcAcidRowBatchReader(splits.get(0), conf, Reporter.NULL, new VectorizedRowBatchCtx()); if (deleteEventRegistry.equals(ColumnizedDeleteEventRegistry.class.getName())) { assertTrue(vectorizedReader.getDeleteEventRegistry() instanceof ColumnizedDeleteEventRegistry); } @@ -242,20 +243,4 @@ private void testVectorizedOrcAcidRowBatchReader(String deleteEventRegistry) thr } } } - - @Test - public void testCanCreateVectorizedAcidRowBatchReaderOnSplit() throws Exception { - OrcSplit mockSplit = Mockito.mock(OrcSplit.class); - - conf.setInt(HiveConf.ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname, - AcidUtils.AcidOperationalProperties.getDefault().toInt()); - Mockito.when(mockSplit.isOriginal()).thenReturn(true); - // Test false when trying to create a vectorized ACID row batch reader when reading originals. - assertFalse(VectorizedOrcAcidRowBatchReader.canCreateVectorizedAcidRowBatchReaderOnSplit(conf, mockSplit)); - - // A positive test case. - Mockito.when(mockSplit.isOriginal()).thenReturn(false); - assertTrue(VectorizedOrcAcidRowBatchReader.canCreateVectorizedAcidRowBatchReaderOnSplit(conf, mockSplit)); - } - } diff --git ql/src/test/queries/clientpositive/acid_vectorization_original.q ql/src/test/queries/clientpositive/acid_vectorization_original.q new file mode 100644 index 0000000000..02b8b68e0f --- /dev/null +++ ql/src/test/queries/clientpositive/acid_vectorization_original.q @@ -0,0 +1,92 @@ +set hive.mapred.mode=nonstrict; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.vectorized.execution.enabled=true; + +-- enable ppd +set hive.optimize.index.filter=true; + +set hive.explain.user=false; + +-- WHY? 
seems needed for TestCliDriver but not TestMiniTezCliDriver +-- set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; + + +CREATE TABLE over10k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE; + +--oddly this has 9999 rows +LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k; + +-- 124|336|65664|4294967435|74.72|42.47|true|bob davidson|2013-03-01 09:11:58.703302|45.40|yard duty +-- 19|442|65553|4294967380|26.43|37.77|true|alice zipper|2013-03-01 09:11:58.703217|29.62|history +-- 35|387|65619|4294967459|96.91|18.86|false|katie davidson|2013-03-01 09:11:58.703079|27.32|history +-- 111|372|65656|4294967312|13.01|34.95|false|xavier quirinius|2013-03-01 09:11:58.703310|23.91|topology + + +CREATE TABLE over10k_orc_bucketed(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC; + +-- this produces about 250 distinct values across all 4 equivalence classes +select distinct si, si%4 from over10k; +-- explain insert into over10k_orc_bucketed select * from over10k cluster by si; +-- w/o "cluster by" all data is written to 000000_0 + insert into over10k_orc_bucketed select * from over10k cluster by si; + +dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc_bucketed; +-- create copy_N files +insert into over10k_orc_bucketed select * from over10k cluster by si; + +-- this doesn't produce any output in .out - it is visible in .orig +dfs -ls ${hiveconf:hive.metastore.warehouse.dir}/over10k_orc_bucketed; + +--this actually shows the data files in the .out +select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed; + +alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true'); + +-- this should vectorize (and push predicate to storage: filterExpr in TableScan ) +explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i; +select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i; + +-- this should vectorize (and push predicate to storage: filterExpr in TableScan ) +explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID; +select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID; + +-- this should vectorize (and push predicate to storage: filterExpr in TableScan ) +explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100; +update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100; + +-- this should produce the same result (data) as previous time this exact query ran +-- ROW__ID will be different (same bucketProperty) +select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID; + +-- this test that there are no duplicate ROW__IDs so should produce no output +select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1; + +-- can I look at delete delta content? I can check the data - this would imply delete matched +-- I can also select INPUT_FILE... +alter table over10k_orc_bucketed compact 'major'; +-- damn it - we need to run compaction here.... create UDF to runWorker? 
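For reference, the synthetic ROW__ID scheme exercised by this test can be summarized with a short sketch. This is an illustration only, based on computeOffsetAndBucket() and next() in VectorizedOrcAcidRowBatchReader above; the class name SyntheticRowIdSketch and the parameters splitPath, rowIdOffset and rowInFile are placeholders, not names from the patch.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.BucketCodec;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.mapred.JobConf;

/** Illustration of how a synthetic ROW__ID is derived for one row of an "original" file. */
final class SyntheticRowIdSketch {
  static RecordIdentifier syntheticRowId(Path splitPath, JobConf conf,
      long rowIdOffset, long rowInFile) {
    // All pre-acid ("original") data is treated as written by the committed transaction 0.
    long originalTransactionId = 0L;
    // The bucket id comes from the file name and is packed together with statement id 0 using
    // the V1 bucket codec, mirroring computeOffsetAndBucket() in this patch.
    int bucketId = AcidUtils.parseBaseOrDeltaBucketFilename(splitPath, conf).getBucketId();
    int bucketProperty = BucketCodec.V1.encode(
        new AcidOutputFormat.Options(conf).statementId(0).bucket(bucketId));
    // rowIdOffset is the total row count of all earlier "original" files of the same bucket and
    // rowInFile is the row's position within this file, so every reader (and later the compactor)
    // assigns the same ROW__ID to the same row.
    return new RecordIdentifier(originalTransactionId, bucketProperty, rowIdOffset + rowInFile);
  }
}

This is what keeps the ROW__ID values in the golden output below, such as {"transactionid":0,"bucketid":536870912,"rowid":104}, consistent between the vectorized and non-vectorized read paths and across repeated reads of the same row.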
diff --git ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out new file mode 100644 index 0000000000..aeb59a733d --- /dev/null +++ ql/src/test/results/clientpositive/llap/acid_vectorization_original.q.out @@ -0,0 +1,636 @@ +PREHOOK: query: CREATE TABLE over10k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@over10k +POSTHOOK: query: CREATE TABLE over10k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@over10k +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@over10k +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@over10k +PREHOOK: query: CREATE TABLE over10k_orc_bucketed(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: CREATE TABLE over10k_orc_bucketed(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@over10k_orc_bucketed +PREHOOK: query: select distinct si, si%4 from over10k +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k +#### A masked pattern was here #### +POSTHOOK: query: select distinct si, si%4 from over10k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k +#### A masked pattern was here #### +265 1 +268 0 +270 2 +271 3 +276 0 +279 3 +281 1 +287 3 +290 2 +291 3 +294 2 +300 0 +302 2 +304 0 +305 1 +310 2 +312 0 +313 1 +316 0 +325 1 +332 0 +340 0 +351 3 +358 2 +362 2 +363 3 +371 3 +373 1 +374 2 +376 0 +379 3 +383 3 +388 0 +392 0 +393 1 +395 3 +396 0 +399 3 +406 2 +408 0 +413 1 +418 2 +427 3 +428 0 +435 3 +441 1 +442 2 +444 0 +452 0 +454 2 +459 3 +460 0 +465 1 +467 3 +484 0 +486 2 +488 0 +490 2 +492 0 +495 3 +502 2 +504 0 +505 1 +510 2 +256 0 +266 2 +272 0 +278 2 +280 0 +282 2 +289 1 +295 3 +301 1 +321 1 +324 0 +329 1 +334 2 +335 3 +336 0 +337 1 +343 3 +344 0 +347 3 +348 0 +349 1 +353 1 +357 1 +365 1 +368 0 +377 1 +382 2 +384 0 +386 2 +394 2 +397 1 +398 2 +400 0 +401 1 +402 2 +404 0 +409 1 +417 1 +419 3 +424 0 +430 2 +434 2 +436 0 +437 1 +439 3 +440 0 +446 2 +461 1 +468 0 +472 0 +480 0 +496 0 +497 1 +507 3 +511 3 +NULL NULL +259 3 +260 0 +261 1 +262 2 +275 3 +277 1 +283 3 +284 0 +285 1 +292 0 +299 3 +309 1 +314 2 +315 3 +320 0 +323 3 +330 2 +331 3 +333 1 +339 3 +341 1 +350 2 +360 0 +367 3 +381 1 +385 1 +405 1 +410 2 +414 2 +422 2 +423 3 +425 1 +431 3 +432 0 +433 1 +438 2 +443 3 +445 1 +447 3 +449 1 +450 2 +451 3 +453 
1 +457 1 +462 2 +463 3 +475 3 +476 0 +478 2 +482 2 +483 3 +487 3 +494 2 +498 2 +499 3 +503 3 +508 0 +257 1 +258 2 +263 3 +264 0 +267 3 +269 1 +273 1 +274 2 +286 2 +288 0 +293 1 +296 0 +297 1 +298 2 +303 3 +306 2 +307 3 +308 0 +311 3 +317 1 +318 2 +319 3 +322 2 +326 2 +327 3 +328 0 +338 2 +342 2 +345 1 +346 2 +352 0 +354 2 +355 3 +356 0 +359 3 +361 1 +364 0 +366 2 +370 2 +372 0 +375 3 +378 2 +380 0 +387 3 +389 1 +390 2 +391 3 +403 3 +407 3 +411 3 +415 3 +420 0 +421 1 +426 2 +429 1 +448 0 +455 3 +456 0 +458 2 +464 0 +466 2 +469 1 +471 3 +473 1 +474 2 +477 1 +479 3 +481 1 +485 1 +489 1 +491 3 +493 1 +500 0 +501 1 +506 2 +509 1 +PREHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k +POSTHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: Lineage: over10k_orc_bucketed.b SIMPLE [(over10k)over10k.FieldSchema(name:b, type:bigint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bin SIMPLE [(over10k)over10k.FieldSchema(name:bin, type:binary, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bo SIMPLE [(over10k)over10k.FieldSchema(name:bo, type:boolean, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.d SIMPLE [(over10k)over10k.FieldSchema(name:d, type:double, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.dec SIMPLE [(over10k)over10k.FieldSchema(name:dec, type:decimal(4,2), comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.f SIMPLE [(over10k)over10k.FieldSchema(name:f, type:float, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.i SIMPLE [(over10k)over10k.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.s SIMPLE [(over10k)over10k.FieldSchema(name:s, type:string, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.si SIMPLE [(over10k)over10k.FieldSchema(name:si, type:smallint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.t SIMPLE [(over10k)over10k.FieldSchema(name:t, type:tinyint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.ts SIMPLE [(over10k)over10k.FieldSchema(name:ts, type:timestamp, comment:null), ] +Found 4 items +#### A masked pattern was here #### +PREHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k +POSTHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: Lineage: over10k_orc_bucketed.b SIMPLE [(over10k)over10k.FieldSchema(name:b, type:bigint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bin SIMPLE [(over10k)over10k.FieldSchema(name:bin, type:binary, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bo SIMPLE [(over10k)over10k.FieldSchema(name:bo, type:boolean, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.d SIMPLE [(over10k)over10k.FieldSchema(name:d, type:double, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.dec SIMPLE [(over10k)over10k.FieldSchema(name:dec, type:decimal(4,2), comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.f SIMPLE [(over10k)over10k.FieldSchema(name:f, type:float, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.i SIMPLE 
[(over10k)over10k.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.s SIMPLE [(over10k)over10k.FieldSchema(name:s, type:string, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.si SIMPLE [(over10k)over10k.FieldSchema(name:si, type:smallint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.t SIMPLE [(over10k)over10k.FieldSchema(name:t, type:tinyint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.ts SIMPLE [(over10k)over10k.FieldSchema(name:ts, type:timestamp, comment:null), ] +Found 8 items +#### A masked pattern was here #### +PREHOOK: query: select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +PREHOOK: query: alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true') +PREHOOK: type: ALTERTABLE_PROPERTIES +PREHOOK: Input: default@over10k_orc_bucketed +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: ALTERTABLE_PROPERTIES +POSTHOOK: Input: default@over10k_orc_bucketed +POSTHOOK: Output: default@over10k_orc_bucketed +PREHOOK: query: explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +PREHOOK: type: QUERY +POSTHOOK: query: explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: over10k_orc_bucketed + filterExpr: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 2098 Data size: 39900 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t (type: tinyint), si (type: smallint), i (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: smallint), _col2 (type: int) + sort order: +++ + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: tinyint), KEY.reducesinkkey1 (type: smallint), KEY.reducesinkkey2 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 
-1 + Processor Tree: + ListSink + +PREHOOK: query: select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +-3 344 65733 +-3 344 65733 +5 501 65585 +5 501 65585 +35 463 65646 +35 463 65646 +PREHOOK: query: explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +PREHOOK: type: QUERY +POSTHOOK: query: explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: over10k_orc_bucketed + filterExpr: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 2098 Data size: 39900 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ROW__ID (type: struct), t (type: tinyint), si (type: smallint), i (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: struct) + sort order: + + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: tinyint), _col2 (type: smallint), _col3 (type: int) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), VALUE._col2 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +{"transactionid":0,"bucketid":536870912,"rowid":104} -3 344 65733 +{"transactionid":0,"bucketid":536870912,"rowid":368} -3 344 65733 +{"transactionid":0,"bucketid":536936448,"rowid":250} 5 501 65585 +{"transactionid":0,"bucketid":536936448,"rowid":512} 5 501 65585 
+{"transactionid":0,"bucketid":537067520,"rowid":224} 35 463 65646 +{"transactionid":0,"bucketid":537067520,"rowid":501} 35 463 65646 +PREHOOK: query: explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +PREHOOK: type: QUERY +POSTHOOK: query: explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: over10k_orc_bucketed + filterExpr: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 2098 Data size: 1021440 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ROW__ID (type: struct), t (type: tinyint), si (type: smallint), f (type: float), d (type: double), bo (type: boolean), s (type: string), ts (type: timestamp), dec (type: decimal(4,2)), bin (type: binary) + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: struct) + sort order: + + Map-reduce partition columns: UDFToInteger(_col0) (type: int) + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: tinyint), _col2 (type: smallint), _col5 (type: float), _col6 (type: double), _col7 (type: boolean), _col8 (type: string), _col9 (type: timestamp), _col10 (type: decimal(4,2)), _col11 (type: binary) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), 0 (type: int), 4294967363 (type: bigint), VALUE._col3 (type: float), VALUE._col4 (type: double), VALUE._col5 (type: boolean), VALUE._col6 (type: string), VALUE._col7 (type: timestamp), VALUE._col8 (type: decimal(4,2)), VALUE._col9 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.over10k_orc_bucketed + Write Type: UPDATE + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: false + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.over10k_orc_bucketed + Write Type: UPDATE + + Stage: Stage-3 + Stats-Aggr Operator + +PREHOOK: query: update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +PREHOOK: type: QUERY +PREHOOK: Input: 
default@over10k_orc_bucketed +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +POSTHOOK: Output: default@over10k_orc_bucketed +PREHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +{"transactionid":14,"bucketid":536870912,"rowid":0} -3 344 0 +{"transactionid":14,"bucketid":536870912,"rowid":1} -3 344 0 +{"transactionid":14,"bucketid":536936448,"rowid":0} 5 501 0 +{"transactionid":14,"bucketid":536936448,"rowid":1} 5 501 0 +{"transactionid":14,"bucketid":537067520,"rowid":0} 35 463 0 +{"transactionid":14,"bucketid":537067520,"rowid":1} 35 463 0 +PREHOOK: query: select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +NULL 6 +PREHOOK: query: alter table over10k_orc_bucketed compact 'major' +PREHOOK: type: ALTERTABLE_COMPACT +POSTHOOK: query: alter table over10k_orc_bucketed compact 'major' +POSTHOOK: type: ALTERTABLE_COMPACT diff --git ql/src/test/results/clientpositive/tez/acid_vectorization_original.q.out ql/src/test/results/clientpositive/tez/acid_vectorization_original.q.out new file mode 100644 index 0000000000..6099f81b7b --- /dev/null +++ ql/src/test/results/clientpositive/tez/acid_vectorization_original.q.out @@ -0,0 +1,631 @@ +PREHOOK: query: CREATE TABLE over10k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@over10k +POSTHOOK: query: CREATE TABLE over10k(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) +ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' +STORED AS TEXTFILE +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@over10k +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@over10k +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/over1k' OVERWRITE INTO TABLE over10k +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@over10k +PREHOOK: query: CREATE TABLE over10k_orc_bucketed(t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: CREATE TABLE over10k_orc_bucketed(t tinyint, + 
si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + `dec` decimal(4,2), + bin binary) CLUSTERED BY(si) INTO 4 BUCKETS STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@over10k_orc_bucketed +PREHOOK: query: select distinct si, si%4 from over10k +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k +#### A masked pattern was here #### +POSTHOOK: query: select distinct si, si%4 from over10k +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k +#### A masked pattern was here #### +NULL NULL +256 0 +257 1 +258 2 +259 3 +260 0 +261 1 +262 2 +263 3 +264 0 +265 1 +266 2 +267 3 +268 0 +269 1 +270 2 +271 3 +272 0 +273 1 +274 2 +275 3 +276 0 +277 1 +278 2 +279 3 +280 0 +281 1 +282 2 +283 3 +284 0 +285 1 +286 2 +287 3 +288 0 +289 1 +290 2 +291 3 +292 0 +293 1 +294 2 +295 3 +296 0 +297 1 +298 2 +299 3 +300 0 +301 1 +302 2 +303 3 +304 0 +305 1 +306 2 +307 3 +308 0 +309 1 +310 2 +311 3 +312 0 +313 1 +314 2 +315 3 +316 0 +317 1 +318 2 +319 3 +320 0 +321 1 +322 2 +323 3 +324 0 +325 1 +326 2 +327 3 +328 0 +329 1 +330 2 +331 3 +332 0 +333 1 +334 2 +335 3 +336 0 +337 1 +338 2 +339 3 +340 0 +341 1 +342 2 +343 3 +344 0 +345 1 +346 2 +347 3 +348 0 +349 1 +350 2 +351 3 +352 0 +353 1 +354 2 +355 3 +356 0 +357 1 +358 2 +359 3 +360 0 +361 1 +362 2 +363 3 +364 0 +365 1 +366 2 +367 3 +368 0 +370 2 +371 3 +372 0 +373 1 +374 2 +375 3 +376 0 +377 1 +378 2 +379 3 +380 0 +381 1 +382 2 +383 3 +384 0 +385 1 +386 2 +387 3 +388 0 +389 1 +390 2 +391 3 +392 0 +393 1 +394 2 +395 3 +396 0 +397 1 +398 2 +399 3 +400 0 +401 1 +402 2 +403 3 +404 0 +405 1 +406 2 +407 3 +408 0 +409 1 +410 2 +411 3 +413 1 +414 2 +415 3 +417 1 +418 2 +419 3 +420 0 +421 1 +422 2 +423 3 +424 0 +425 1 +426 2 +427 3 +428 0 +429 1 +430 2 +431 3 +432 0 +433 1 +434 2 +435 3 +436 0 +437 1 +438 2 +439 3 +440 0 +441 1 +442 2 +443 3 +444 0 +445 1 +446 2 +447 3 +448 0 +449 1 +450 2 +451 3 +452 0 +453 1 +454 2 +455 3 +456 0 +457 1 +458 2 +459 3 +460 0 +461 1 +462 2 +463 3 +464 0 +465 1 +466 2 +467 3 +468 0 +469 1 +471 3 +472 0 +473 1 +474 2 +475 3 +476 0 +477 1 +478 2 +479 3 +480 0 +481 1 +482 2 +483 3 +484 0 +485 1 +486 2 +487 3 +488 0 +489 1 +490 2 +491 3 +492 0 +493 1 +494 2 +495 3 +496 0 +497 1 +498 2 +499 3 +500 0 +501 1 +502 2 +503 3 +504 0 +505 1 +506 2 +507 3 +508 0 +509 1 +510 2 +511 3 +PREHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k +POSTHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: Lineage: over10k_orc_bucketed.b SIMPLE [(over10k)over10k.FieldSchema(name:b, type:bigint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bin SIMPLE [(over10k)over10k.FieldSchema(name:bin, type:binary, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bo SIMPLE [(over10k)over10k.FieldSchema(name:bo, type:boolean, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.d SIMPLE [(over10k)over10k.FieldSchema(name:d, type:double, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.dec SIMPLE [(over10k)over10k.FieldSchema(name:dec, type:decimal(4,2), comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.f SIMPLE [(over10k)over10k.FieldSchema(name:f, type:float, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.i SIMPLE 
[(over10k)over10k.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.s SIMPLE [(over10k)over10k.FieldSchema(name:s, type:string, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.si SIMPLE [(over10k)over10k.FieldSchema(name:si, type:smallint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.t SIMPLE [(over10k)over10k.FieldSchema(name:t, type:tinyint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.ts SIMPLE [(over10k)over10k.FieldSchema(name:ts, type:timestamp, comment:null), ] +Found 4 items +#### A masked pattern was here #### +PREHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: insert into over10k_orc_bucketed select * from over10k cluster by si +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k +POSTHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: Lineage: over10k_orc_bucketed.b SIMPLE [(over10k)over10k.FieldSchema(name:b, type:bigint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bin SIMPLE [(over10k)over10k.FieldSchema(name:bin, type:binary, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.bo SIMPLE [(over10k)over10k.FieldSchema(name:bo, type:boolean, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.d SIMPLE [(over10k)over10k.FieldSchema(name:d, type:double, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.dec SIMPLE [(over10k)over10k.FieldSchema(name:dec, type:decimal(4,2), comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.f SIMPLE [(over10k)over10k.FieldSchema(name:f, type:float, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.i SIMPLE [(over10k)over10k.FieldSchema(name:i, type:int, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.s SIMPLE [(over10k)over10k.FieldSchema(name:s, type:string, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.si SIMPLE [(over10k)over10k.FieldSchema(name:si, type:smallint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.t SIMPLE [(over10k)over10k.FieldSchema(name:t, type:tinyint, comment:null), ] +POSTHOOK: Lineage: over10k_orc_bucketed.ts SIMPLE [(over10k)over10k.FieldSchema(name:ts, type:timestamp, comment:null), ] +Found 8 items +#### A masked pattern was here #### +PREHOOK: query: select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select distinct 7 as seven, INPUT__FILE__NAME from over10k_orc_bucketed +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +PREHOOK: query: alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true') +PREHOOK: type: ALTERTABLE_PROPERTIES +PREHOOK: Input: default@over10k_orc_bucketed +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: alter table over10k_orc_bucketed set TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: ALTERTABLE_PROPERTIES +POSTHOOK: Input: default@over10k_orc_bucketed +POSTHOOK: Output: default@over10k_orc_bucketed +PREHOOK: query: explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +PREHOOK: type: QUERY +POSTHOOK: query: explain select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: 
Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: over10k_orc_bucketed + filterExpr: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 2098 Data size: 39900 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t (type: tinyint), si (type: smallint), i (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: smallint), _col2 (type: int) + sort order: +++ + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: tinyint), KEY.reducesinkkey1 (type: smallint), KEY.reducesinkkey2 (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by t, si, i +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +-3 344 65733 +-3 344 65733 +5 501 65585 +5 501 65585 +35 463 65646 +35 463 65646 +PREHOOK: query: explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +PREHOOK: type: QUERY +POSTHOOK: query: explain select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: over10k_orc_bucketed + filterExpr: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 2098 Data size: 39900 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ROW__ID (type: struct), t (type: tinyint), si (type: smallint), i (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: struct) + sort order: + + Statistics: Num rows: 1 Data size: 19 
Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: tinyint), _col2 (type: smallint), _col3 (type: int) + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), VALUE._col2 (type: int) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 19 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +{"transactionid":0,"bucketid":536870912,"rowid":104} -3 344 65733 +{"transactionid":0,"bucketid":536870912,"rowid":368} -3 344 65733 +{"transactionid":0,"bucketid":536936448,"rowid":250} 5 501 65585 +{"transactionid":0,"bucketid":536936448,"rowid":512} 5 501 65585 +{"transactionid":0,"bucketid":537067520,"rowid":224} 35 463 65646 +{"transactionid":0,"bucketid":537067520,"rowid":501} 35 463 65646 +PREHOOK: query: explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +PREHOOK: type: QUERY +POSTHOOK: query: explain update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: over10k_orc_bucketed + filterExpr: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 2098 Data size: 1021440 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((b = 4294967363) and (t < 100)) (type: boolean) + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ROW__ID (type: struct), t (type: tinyint), si (type: smallint), f (type: float), d (type: double), bo (type: boolean), s (type: string), ts (type: timestamp), dec (type: decimal(4,2)), bin (type: binary) + outputColumnNames: _col0, _col1, _col2, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: struct) + sort order: + + Map-reduce partition columns: UDFToInteger(_col0) (type: int) + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: tinyint), _col2 (type: smallint), _col5 (type: float), _col6 (type: double), _col7 (type: boolean), _col8 (type: string), _col9 (type: timestamp), _col10 (type: decimal(4,2)), _col11 (type: binary) + 
Execution mode: vectorized + Reducer 2 + Execution mode: vectorized + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: struct), VALUE._col0 (type: tinyint), VALUE._col1 (type: smallint), 0 (type: int), 4294967363 (type: bigint), VALUE._col3 (type: float), VALUE._col4 (type: double), VALUE._col5 (type: boolean), VALUE._col6 (type: string), VALUE._col7 (type: timestamp), VALUE._col8 (type: decimal(4,2)), VALUE._col9 (type: binary) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11 + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 486 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.over10k_orc_bucketed + Write Type: UPDATE + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: false + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.over10k_orc_bucketed + Write Type: UPDATE + + Stage: Stage-3 + Stats-Aggr Operator + +PREHOOK: query: update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +PREHOOK: Output: default@over10k_orc_bucketed +POSTHOOK: query: update over10k_orc_bucketed set i = 0 where b = 4294967363 and t < 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +POSTHOOK: Output: default@over10k_orc_bucketed +PREHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, t, si, i from over10k_orc_bucketed where b = 4294967363 and t < 100 order by ROW__ID +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +{"transactionid":14,"bucketid":536870912,"rowid":0} -3 344 0 +{"transactionid":14,"bucketid":536870912,"rowid":1} -3 344 0 +{"transactionid":14,"bucketid":536936448,"rowid":0} 5 501 0 +{"transactionid":14,"bucketid":536936448,"rowid":1} 5 501 0 +{"transactionid":14,"bucketid":537067520,"rowid":0} 35 463 0 +{"transactionid":14,"bucketid":537067520,"rowid":1} 35 463 0 +PREHOOK: query: select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +POSTHOOK: query: select ROW__ID, count(*) from over10k_orc_bucketed group by ROW__ID having count(*) > 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@over10k_orc_bucketed +#### A masked pattern was here #### +PREHOOK: query: alter table over10k_orc_bucketed compact 'major' +PREHOOK: type: ALTERTABLE_COMPACT +POSTHOOK: query: alter table over10k_orc_bucketed compact 'major' +POSTHOOK: type: ALTERTABLE_COMPACT
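
Note on the ROW__ID values in the output above: the bucketid field (536870912, 536936448, 537067520) is a packed "bucket property", not a plain bucket number. The sketch below is a minimal, standalone decoding of those three values, assuming a BucketCodec.V1-style bit layout (top 3 bits = version, 1 reserved bit, 12-bit bucket id, 4 reserved bits, 12-bit statement id); the class and method names are illustrative only and are not Hive's own codec.

    // Illustrative sketch only (not Hive source): decodes the bucketid values that appear
    // in the ROW__ID structs above, assuming a BucketCodec.V1-style bit layout:
    //   [3 bits version][1 bit reserved][12 bits bucket id][4 bits reserved][12 bits statement id]
    public class BucketPropertySketch {
      static int version(int prop)     { return prop >>> 29; }           // top 3 bits
      static int bucketId(int prop)    { return (prop >>> 16) & 0xFFF; } // 12-bit bucket id field
      static int statementId(int prop) { return prop & 0xFFF; }          // low 12 bits

      public static void main(String[] args) {
        // bucketid values copied verbatim from the ROW__ID rows in the q.out above
        int[] props = {536870912, 536936448, 537067520};
        for (int p : props) {
          System.out.printf("prop=%d -> version=%d, bucket=%d, statementId=%d%n",
              p, version(p), bucketId(p), statementId(p));
        }
      }
    }

Under that assumed layout the three values decode to buckets 0, 1 and 3, which is consistent with si % 4 for si = 344, 501 and 463 in a table CLUSTERED BY(si) INTO 4 BUCKETS (see the "select distinct si, si%4" output above). Likewise, transactionid 0 appears to mark rows read from the pre-ACID "original" files, while transactionid 14 (with rowid restarting at 0) marks the rows rewritten by the UPDATE statement.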