diff --git ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index cc840be..0c737b3 100644 --- ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -2090,6 +2090,9 @@ public static void copyTableJobPropertiesToConf(TableDesc tbl, JobConf job) { for (Map.Entry entry : jobProperties.entrySet()) { job.set(entry.getKey(), entry.getValue()); } + // copy the bucket count + job.set(hive_metastoreConstants.BUCKET_COUNT, + tbl.getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT)); } private static final Object INPUT_SUMMARY_LOCK = new Object(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java new file mode 100644 index 0000000..7fb9ce2 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/AcidInputFormat.java @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +import java.io.IOException; + +public interface AcidInputFormat + extends InputFormat, InputFormatChecker { + + public static class Options { + private final Configuration conf; + private Reporter reporter; + + public Options(Configuration conf) { + this.conf = conf; + } + + public Options reporter(Reporter reporter) { + this.reporter = reporter; + return this; + } + + public Configuration getConfiguration() { + return conf; + } + + public Reporter getReporter() { + return reporter; + } + } + + public static interface RowReader + extends RecordReader { + public ObjectInspector getObjectInspector(); + } + + public RowReader getReader(InputSplit split, + Options options) throws IOException; + + public static interface RawReader + extends RecordReader { + public ObjectInspector getObjectInspector(); + } + + /** + * Get a reader that returns the raw ACID events (insert, update, delete). + * Should only be used by the compactor. + * @param conf the configuration + * @param collapseEvents should the ACID events be collapsed so that only + * the last version of the row is kept. 
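Illustration (not part of the patch): the interface above splits reading into two paths. RowReader merges base and delta files and returns only the latest visible version of each row, keyed by a RecordIdentifier; RawReader exposes the underlying insert/update/delete events and is intended only for the compactor via getRawReader below. A rough usage sketch for the query path, with variable names assumed:

AcidInputFormat.Options readOptions = new AcidInputFormat.Options(conf).reporter(reporter);
AcidInputFormat.RowReader rows = inputFormat.getReader(split, readOptions);
RecordIdentifier key = rows.createKey();         // carries (transactionId, bucketId, rowId)
OrcStruct row = (OrcStruct) rows.createValue();  // OrcStruct in the ORC implementation below
while (rows.next(key, row)) {
  // only the newest committed version of each row arrives here; deleted rows are filtered out
}
rows.close();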
+ * @param bucket the bucket to read + * @param baseDirectory the base directory to read or the root directory for + * old style files + * @param deltaDirectory a list of delta files to include in the merge + * @return a record reader + * @throws IOException + */ + RawReader getRawReader(Configuration conf, + boolean collapseEvents, + int bucket, + Path baseDirectory, + Path... deltaDirectory + ) throws IOException; +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java new file mode 100644 index 0000000..46a6811 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/AcidOutputFormat.java @@ -0,0 +1,182 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.RecordWriter; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.Reporter; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.Properties; + +public interface AcidOutputFormat extends HiveOutputFormat { + + public static class Options { + private final Configuration configuration; + private FileSystem fs; + private ObjectInspector inspector; + private boolean writingBase = false; + private boolean isCompressed = false; + private Properties properties; + private Reporter reporter; + private long minimumTransactionId; + private long maximumTransactionId; + private int bucket; + private PrintStream dummyStream = null; + private boolean oldStyle = false; + + public Options(Configuration conf) { + this.configuration = conf; + } + + public Options inspector(ObjectInspector inspector) { + this.inspector = inspector; + return this; + } + + public Options writingBase(boolean val) { + this.writingBase = val; + return this; + } + + public Options filesystem(FileSystem fs) { + this.fs = fs; + return this; + } + + public Options isCompressed(boolean isCompressed) { + this.isCompressed = isCompressed; + return this; + } + + public Options tableProperties(Properties properties) { + this.properties = properties; + return this; + } + + public Options reporter(Reporter reporter) { + this.reporter = reporter; + return this; + } + + public Options minimumTransactionId(long min) { + this.minimumTransactionId = min; + return this; + } + + public Options maximumTransactionId(long max) { + this.maximumTransactionId = max; + return this; + } + + public Options bucket(int bucket) { + this.bucket = bucket; + return this; + } + + Options setOldStyle(boolean value) { + oldStyle = value; + return this; + } + 
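Illustration (not part of the patch): every setter on Options returns this, so a writer's configuration is assembled as one fluent chain and handed to getRecordUpdater further down. A hypothetical delta-writer setup, all values illustrative:

AcidOutputFormat.Options writerOptions = new AcidOutputFormat.Options(conf)
    .filesystem(fs)
    .inspector(rowInspector)        // ObjectInspector describing the row being written
    .bucket(0)
    .minimumTransactionId(100)
    .maximumTransactionId(100)      // a single-transaction delta
    .writingBase(false)             // a delta, not a compacted base
    .isCompressed(true)
    .reporter(reporter);
RecordUpdater updater = outputFormat.getRecordUpdater(partitionDir, writerOptions);

AcidUtils.createFilename (later in this patch) maps exactly these options to the on-disk name, e.g. delta_0000100_0000100/bucket_00000.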
+ /** + * Temporary switch while we are in development that replaces the + * implementation with a dummy one that just prints to stream. + * @param stream the stream to print to + * @return this + */ + public Options useDummy(PrintStream stream) { + this.dummyStream = stream; + return this; + } + + public Configuration getConfiguration() { + return configuration; + } + + public FileSystem getFilesystem() { + return fs; + } + + public ObjectInspector getInspector() { + return inspector; + } + + public boolean isCompressed() { + return isCompressed; + } + + public Properties getTableProperties() { + return properties; + } + + public Reporter getReporter() { + return reporter; + } + + public long getMinimumTransactionId() { + return minimumTransactionId; + } + + public long getMaximumTransactionId() { + return maximumTransactionId; + } + + public boolean isWritingBase() { + return writingBase; + } + + public int getBucket() { + return bucket; + } + + public PrintStream getDummyStream() { + return dummyStream; + } + + boolean getOldStyle() { + return oldStyle; + } + } + + /** + * Create a RecordUpdater for inserting, updating, or deleting records. + * @param path the partition directory name + * @param options the options for the writer + * @return the RecordUpdater for the output file + */ + public RecordUpdater getRecordUpdater(Path path, + Options options) throws IOException; + + /** + * Create a raw writer for ACID events. + * This is only intended for the compactor. + * @param path + * @param options + * @return + * @throws IOException + */ + public FSRecordWriter getRawRecordWriter(Path path, + Options options) throws IOException; +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java new file mode 100644 index 0000000..648062a --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -0,0 +1,392 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.shims.HadoopShims; +import org.apache.hadoop.hive.shims.ShimLoader; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.regex.Pattern; + +public class AcidUtils { + private AcidUtils() { + // NOT USED + } + private static final Log LOG = LogFactory.getLog(AcidUtils.class.getName()); + + public static final String BASE_PREFIX = "base_"; + public static final String DELTA_PREFIX = "delta_"; + public static final String BUCKET_PREFIX = "bucket_"; + + private static final String BUCKET_DIGITS = "%05d"; + private static final String DELTA_DIGITS = "%07d"; + + private static final Pattern ORIGINAL_PATTERN = + Pattern.compile("[0-9]+_[0-9]+"); + + public static final PathFilter hiddenFileFilter = new PathFilter(){ + public boolean accept(Path p){ + String name = p.getName(); + return !name.startsWith("_") && !name.startsWith("."); + } + }; + private static final HadoopShims SHIMS = ShimLoader.getHadoopShims(); + + /** + * Create the bucket filename. + * @param subdir the subdirectory for the bucket. + * @param bucket the bucket number + * @return the filename + */ + public static Path createBucketFile(Path subdir, int bucket) { + return new Path(subdir, + BUCKET_PREFIX + String.format(BUCKET_DIGITS, bucket)); + } + + private static String deltaSubdir(long min, long max) { + return DELTA_PREFIX + String.format(DELTA_DIGITS, min) + "_" + + String.format(DELTA_DIGITS, max); + } + + /** + * Create a filename for a bucket file. + * @param directory the partition directory + * @param options the options for writing the bucket + * @return the filename that should store the bucket + */ + public static Path createFilename(Path directory, + AcidOutputFormat.Options options) { + String subdir; + if (options.getOldStyle()) { + return new Path(directory, String.format(BUCKET_DIGITS, + options.getBucket()) + "_0"); + } else if (options.isWritingBase()) { + subdir = BASE_PREFIX + String.format(DELTA_DIGITS, + options.getMaximumTransactionId()); + } else { + subdir = deltaSubdir(options.getMinimumTransactionId(), + options.getMaximumTransactionId()); + } + return createBucketFile(new Path(directory, subdir), options.getBucket()); + } + + /** + * Get the transaction id from a base directory name. + * @param path the base directory name + * @return the maximum transaction id that is included + */ + static long parseBase(Path path) { + String filename = path.getName(); + if (filename.startsWith(BASE_PREFIX)) { + return Long.parseLong(filename.substring(BASE_PREFIX.length())); + } + throw new IllegalArgumentException(filename + " does not start with " + + BASE_PREFIX); + } + + /** + * Parse a bucket filename back into the options that would have created + * the file. 
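Illustration (not part of the patch): the constants and helpers above pin down the on-disk layout: compacted bases live in base_<maxTxn>, deltas in delta_<minTxn>_<maxTxn> (transaction ids zero-padded to seven digits), each containing bucket_<n> files padded to five digits, while pre-ACID files keep the old <bucket>_0 names. A standalone sketch of the same formatting, runnable with only the JDK:

public class AcidNamingSketch {
  // mirrors DELTA_DIGITS ("%07d") and BUCKET_DIGITS ("%05d") above
  static String deltaSubdir(long min, long max) {
    return "delta_" + String.format("%07d", min) + "_" + String.format("%07d", max);
  }
  static String baseSubdir(long maxTxn) {
    return "base_" + String.format("%07d", maxTxn);
  }
  static String bucketFile(String subdir, int bucket) {
    return subdir + "/bucket_" + String.format("%05d", bucket);
  }
  public static void main(String[] args) {
    // delta written by transaction 100, bucket 3:
    System.out.println(bucketFile(deltaSubdir(100, 100), 3)); // delta_0000100_0000100/bucket_00003
    // base from a major compaction covering transactions up to 120, bucket 3:
    System.out.println(bucketFile(baseSubdir(120), 3));       // base_0000120/bucket_00003
  }
}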
+ * @param bucketFile the path to a bucket file + * @param conf the configuration + * @return the options used to create that filename + */ + public static AcidOutputFormat.Options + parseBaseBucketFilename(Path bucketFile, + Configuration conf) { + AcidOutputFormat.Options result = new AcidOutputFormat.Options(conf); + String filename = bucketFile.getName(); + result.writingBase(true); + if (ORIGINAL_PATTERN.matcher(filename).matches()) { + int bucket = + Integer.parseInt(filename.substring(0, filename.indexOf('_'))); + result + .setOldStyle(true) + .minimumTransactionId(0) + .maximumTransactionId(0) + .bucket(bucket); + } else { + int bucket = + Integer.parseInt(filename.substring(filename.indexOf('_') + 1)); + result + .setOldStyle(false) + .minimumTransactionId(0) + .maximumTransactionId(parseBase(bucketFile.getParent())) + .bucket(bucket); + } + return result; + } + + public static interface Directory { + + /** + * Get the base directory. + * @return the base directory to read + */ + Path getBaseDirectory(); + + /** + * Get the list of original files. + * @return the list of original files (eg. 000000_0) + */ + List getOriginalFiles(); + + /** + * Get the list of base and delta directories that are valid and not + * obsolete. + * @return the minimal list of current directories + */ + List getCurrentDirectories(); + + /** + * Get the list of obsolete directories. After filtering out bases and + * deltas that are not selected by the valid transaction list, return the + * list of original files, bases, and deltas that have been replaced by + * more up to date ones. + */ + List getObsolete(); + } + + public static class ParsedDelta implements Comparable { + final long minTransaction; + final long maxTransaction; + final FileStatus path; + + ParsedDelta(long min, long max, FileStatus path) { + this.minTransaction = min; + this.maxTransaction = max; + this.path = path; + } + + public long getMinTransaction() { + return minTransaction; + } + + public long getMaxTransaction() { + return maxTransaction; + } + + public Path getPath() { + return path.getPath(); + } + + @Override + public int compareTo(ParsedDelta parsedDelta) { + if (minTransaction != parsedDelta.minTransaction) { + if (minTransaction < parsedDelta.minTransaction) { + return -1; + } else { + return 1; + } + } else if (maxTransaction != parsedDelta.maxTransaction) { + if (maxTransaction < parsedDelta.maxTransaction) { + return 1; + } else { + return -1; + } + } else { + return path.compareTo(parsedDelta.path); + } + } + } + + /** + * Convert the list of deltas into an equivalent list of begin/end + * transaction id pairs. + * @param deltas + * @return the list of transaction ids to serialize + */ + public static List serializeDeltas(List deltas) { + List result = new ArrayList(deltas.size() * 2); + for(ParsedDelta delta: deltas) { + result.add(delta.minTransaction); + result.add(delta.maxTransaction); + } + return result; + } + + /** + * Convert the list of begin/end transaction id pairs to a list of delta + * directories. 
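Worked example (not part of the patch): serializeDeltas above and deserializeDeltas, whose javadoc continues below, form a round trip designed so a split only has to carry numbers, not paths. The current directories

  delta_0000005_0000005
  delta_0000006_0000010

serialize to the flat list [5, 5, 6, 10]; deserializeDeltas(root, [5, 5, 6, 10]) then rebuilds root/delta_0000005_0000005 and root/delta_0000006_0000010 on the reader side. This is how OrcSplit later in the patch transports its delta list.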
+ * @param root the root directory + * @param deltas list of begin/end transaction id pairs + * @return the list of delta paths + */ + public static Path[] deserializeDeltas(Path root, List deltas) { + int deltaSize = deltas.size() / 2; + Path[] result = new Path[deltaSize]; + for(int i = 0; i < deltaSize; ++i) { + result[i] = new Path(root, deltaSubdir(deltas.get(i * 2), + deltas.get(i * 2 + 1))); + } + return result; + } + + static ParsedDelta parseDelta(FileStatus path) { + String filename = path.getPath().getName(); + if (filename.startsWith(DELTA_PREFIX)) { + String rest = filename.substring(DELTA_PREFIX.length()); + int split = rest.indexOf('_'); + long min = Long.parseLong(rest.substring(0, split)); + long max = Long.parseLong(rest.substring(split + 1)); + return new ParsedDelta(min, max, path); + } + throw new IllegalArgumentException(path + " does not start with " + + DELTA_PREFIX); + } + + /** + * Get the ACID state of the given directory. It finds the minimal set of + * base and diff directories. Note that because major compactions don't + * preserve the history, we can't use a base directory that includes a + * transaction id that we must exclude. + * @param directory the partition directory to analyze + * @param conf the configuration + * @param txnList the list of transactions that we are reading + * @return the state of the directory + * @throws IOException + */ + public static Directory getAcidState(Path directory, + Configuration conf, + IMetaStoreClient.ValidTxnList txnList + ) throws IOException { + FileSystem fs = directory.getFileSystem(conf); + FileStatus bestBase = null; + long bestBaseTxn = 0; + final List deltas = new ArrayList(); + List working = new ArrayList(); + final List original = new ArrayList(); + final List obsolete = new ArrayList(); + Path ignoredBase = null; + + Iterator childIterator = + SHIMS.listLocatedStatus(fs, directory, hiddenFileFilter); + while (childIterator.hasNext()) { + FileStatus child = childIterator.next(); + Path p = child.getPath(); + String fn = p.getName(); + if (fn.startsWith(BASE_PREFIX) && child.isDir()) { + long txn = parseBase(p); + if (txnList.isTxnRangeCommitted(0, txn) != + IMetaStoreClient.ValidTxnList.RangeResponse.ALL) { + ignoredBase = p; + } else { + if (bestBase == null) { + bestBase = child; + bestBaseTxn = txn; + } else if (bestBaseTxn < txn) { + obsolete.add(bestBase); + bestBase = child; + bestBaseTxn = txn; + } else { + obsolete.add(child); + } + } + } else if (fn.startsWith(DELTA_PREFIX) && child.isDir()) { + ParsedDelta delta = parseDelta(child); + if (txnList.isTxnRangeCommitted(delta.minTransaction, + delta.maxTransaction) != + IMetaStoreClient.ValidTxnList.RangeResponse.NONE) { + working.add(delta); + } + } else { + findOriginals(fs, child, original); + } + } + + // Complain if all of the bases were too recent for the minimum excluded + // transaction. + if (bestBase == null && ignoredBase != null) { + throw new IllegalArgumentException("All base directories were ignored," + + " such as " + ignoredBase); + } + + // if we have a base, the original files are obsolete. + if (bestBase != null) { + obsolete.addAll(original); + // remove the entries so we don't get confused later and think we should use them. + original.clear(); + } + + Collections.sort(working); + long current = bestBaseTxn; + for(ParsedDelta next: working) { + if (next.maxTransaction > current) { + // are any of the new transactions ones that we care about? 
+ if (txnList.isTxnRangeCommitted(current+1, next.maxTransaction) != + IMetaStoreClient.ValidTxnList.RangeResponse.NONE) { + deltas.add(next); + current = next.maxTransaction; + } + } else { + obsolete.add(next.path); + } + } + + final Path base = bestBase == null ? null : bestBase.getPath(); + + return new Directory(){ + + @Override + public Path getBaseDirectory() { + return base; + } + + @Override + public List getOriginalFiles() { + return original; + } + + @Override + public List getCurrentDirectories() { + return deltas; + } + + @Override + public List getObsolete() { + return obsolete; + } + }; + } + + private static void findOriginals(FileSystem fs, FileStatus stat, + List original) throws IOException { + if (stat.isDir()) { + Iterator iter = SHIMS.listLocatedStatus(fs, stat.getPath(), hiddenFileFilter); + while (iter.hasNext()) { + FileStatus child = iter.next(); + findOriginals(fs, child, original); + } + } else { + original.add(stat); + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 647a9a6..9d8fb19 100755 --- ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -44,11 +44,8 @@ import org.apache.hadoop.hive.ql.plan.MapWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.PartitionDesc; -import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.plan.TableScanDesc; -import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; -import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.FileInputFormat; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java new file mode 100644 index 0000000..89b9b68 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/RecordIdentifier.java @@ -0,0 +1,145 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.io.WritableComparable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +/** + * Gives the Record identifer information for the current record. 
+ */ +public class RecordIdentifier implements WritableComparable { + private long transactionId; + private int bucketId; + private long rowId; + + public RecordIdentifier() { + } + + public RecordIdentifier(long transactionId, int bucket, long rowId) { + this.transactionId = transactionId; + this.bucketId = bucket; + this.rowId = rowId; + } + + /** + * Set the identifier. + * @param transactionId the transaction id + * @param bucketId the bucket id + * @param rowId the row id + */ + public void setValues(long transactionId, int bucketId, long rowId) { + this.transactionId = transactionId; + this.bucketId = bucketId; + this.rowId = rowId; + } + + /** + * Set this object to match the given object. + * @param other the object to copy from + */ + public void set(RecordIdentifier other) { + this.transactionId = other.transactionId; + this.bucketId = other.bucketId; + this.rowId = other.rowId; + } + + public void setRowId(long rowId) { + this.rowId = rowId; + } + + /** + * What was the original transaction id for the last row? + * @return the transaction id + */ + public long getTransactionId() { + return transactionId; + } + + /** + * What was the original bucket id for the last row? + * @return the bucket id + */ + public int getBucketId() { + return bucketId; + } + + /** + * What was the original row id for the last row? + * @return the row id + */ + public long getRowId() { + return rowId; + } + + protected int compareToInternal(RecordIdentifier other) { + if (other == null) { + return -1; + } + if (transactionId != other.transactionId) { + return transactionId < other.transactionId ? -1 : 1; + } + if (bucketId != other.bucketId) { + return bucketId < other.bucketId ? - 1 : 1; + } + if (rowId != other.rowId) { + return rowId < other.rowId ? -1 : 1; + } + return 0; + } + + @Override + public int compareTo(RecordIdentifier other) { + if (other.getClass() != RecordIdentifier.class) { + return -other.compareTo(this); + } + return compareToInternal(other); + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + throw new UnsupportedOperationException("Can't write RecordIdentifier"); + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + throw new UnsupportedOperationException("Can't read RecordIdentifier"); + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != getClass()) { + return false; + } + RecordIdentifier oth = (RecordIdentifier) other; + return oth.transactionId == transactionId && + oth.bucketId == bucketId && + oth.rowId == rowId; + } + + @Override + public String toString() { + return "{originalTxn: " + transactionId + ", bucket: " + + bucketId + ", row: " + getRowId() + "}"; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/RecordUpdater.java ql/src/java/org/apache/hadoop/hive/ql/io/RecordUpdater.java new file mode 100644 index 0000000..c5153fc --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/RecordUpdater.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; + +import java.io.IOException; + +/** + * API for supporting updating records. + */ +public interface RecordUpdater { + + /** + * Insert a new record into the table. + * @param currentTransaction the transaction id of the current transaction. + * @param bucket the bucket of the row + * @param row the row of data to insert + * @throws IOException + */ + void insert(long currentTransaction, + int bucket, + Object row) throws IOException; + + /** + * Update an old record with a new set of values. + * @param currentTransaction the current transaction id + * @param originalTransaction the row's original transaction id + * @param originalBucket the row's original bucket id + * @param rowId the original row id + * @param row the new values for the row + * @throws IOException + */ + void update(long currentTransaction, + long originalTransaction, + int originalBucket, + long rowId, + Object row) throws IOException; + + /** + * Delete a row from the table. + * @param currentTransaction the current transaction id + * @param originalTransaction the rows original transaction id + * @param originalBucket the row's original bucket id + * @param rowId the original row id + * @throws IOException + */ + void delete(long currentTransaction, + long originalTransaction, + int originalBucket, + long rowId) throws IOException; + + /** + * Flush the current set of rows to the underlying file system, so that + * they are available to readers. Most implementations will need to write + * additional state information when this is called, so it should only be + * called during streaming when a transaction is finished, but the + * RecordUpdater can't be closed yet. + * @throws IOException + */ + void flush() throws IOException; + + /** + * Close this updater. No further calls are legal after this. + * @param abort Can the data since the last flush be discarded? 
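Illustration (not part of the patch): insert, update, delete, flush and close together form the streaming lifecycle, with rows addressed by the same (originalTransaction, bucket, rowId) triple that RecordIdentifier carries. A hypothetical sequence, all ids illustrative:

RecordUpdater updater = outputFormat.getRecordUpdater(partitionDir, writerOptions);
updater.insert(100, 0, row);              // transaction 100 inserts a row into bucket 0
updater.update(101, 100, 0, 7L, newRow);  // txn 101 rewrites row 7 originally written by txn 100
updater.delete(102, 100, 0, 8L);          // txn 102 deletes row 8 from the same original txn/bucket
updater.flush();                          // make everything written so far visible to readers
updater.close(false);                     // false: keep the data written since the last flush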
+ * @throws IOException + */ + void close(boolean abort) throws IOException; + + /** + * Returns the statistics information + * @return SerDeStats + */ + SerDeStats getStats(); +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java index 0143b53..c4ecd07 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java @@ -33,7 +33,7 @@ public static void main(String[] args) throws Exception { for(String filename: args) { System.out.println("Structure for " + filename); Path path = new Path(filename); - Reader reader = OrcFile.createReader(path.getFileSystem(conf), path, conf); + Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); RecordReaderImpl rows = (RecordReaderImpl) reader.rows(null); System.out.println("Rows: " + reader.getNumberOfRows()); System.out.println("Compression: " + reader.getCompression()); @@ -47,7 +47,8 @@ public static void main(String[] args) throws Exception { System.out.println(" Stripe " + (n + 1) + ":"); StripeStatistics ss = metadata.getStripeStatistics().get(n); for (int i = 0; i < ss.getColumnStatistics().length; ++i) { - System.out.println(" Column " + i + ": " + ss.getColumnStatistics()[i].toString()); + System.out.println(" Column " + i + ": " + + ss.getColumnStatistics()[i].toString()); } } ColumnStatistics[] stats = reader.getStatistics(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/MemoryManager.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/MemoryManager.java index 9af12de..38e0d7d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/MemoryManager.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/MemoryManager.java @@ -120,6 +120,9 @@ synchronized void removeWriter(Path path) throws IOException { if (val != null) { writerList.remove(path); totalAllocation -= val.allocation; + if (writerList.isEmpty()) { + rowsAddedSinceCheck = 0; + } updateScale(false); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java index a56fe2f..e902161 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -114,15 +114,70 @@ private OrcFile() {} * @return a new ORC file reader. 
* @throws IOException */ - public static Reader createReader(FileSystem fs, Path path, - Configuration conf) throws IOException { - return new ReaderImpl(fs, path, conf); + public static Reader createReader(FileSystem fs, Path path + ) throws IOException { + ReaderOptions opts = new ReaderOptions(new Configuration()); + opts.filesystem(fs); + return new ReaderImpl(path, opts); } - public static Reader createReader(FileSystem fs, Path path, - FileMetaInfo fileMetaInfo, Configuration conf) - throws IOException { - return new ReaderImpl(fs, path, fileMetaInfo, conf); + public static class ReaderOptions { + private final Configuration conf; + private FileSystem filesystem; + private FileMetaInfo fileMetaInfo; + private long maxLength = Long.MAX_VALUE; + + ReaderOptions(Configuration conf) { + this.conf = conf; + } + ReaderOptions fileMetaInfo(FileMetaInfo info) { + fileMetaInfo = info; + return this; + } + + public ReaderOptions filesystem(FileSystem fs) { + this.filesystem = fs; + return this; + } + + public ReaderOptions maxLength(long val) { + maxLength = val; + return this; + } + + Configuration getConfiguration() { + return conf; + } + + FileSystem getFilesystem() { + return filesystem; + } + + FileMetaInfo getFileMetaInfo() { + return fileMetaInfo; + } + + long getMaxLength() { + return maxLength; + } + } + + public static ReaderOptions readerOptions(Configuration conf) { + return new ReaderOptions(conf); + } + + public static Reader createReader(Path path, + ReaderOptions options) throws IOException { + return new ReaderImpl(path, options); + } + + public static interface WriterContext { + Writer getWriter(); + } + + public static interface WriterCallback { + public void preStripeWrite(WriterContext context) throws IOException; + public void preFooterWrite(WriterContext context) throws IOException; } /** @@ -139,6 +194,7 @@ public static Reader createReader(FileSystem fs, Path path, private CompressionKind compressValue; private MemoryManager memoryManagerValue; private Version versionValue; + private WriterCallback callback; WriterOptions(Configuration conf) { configuration = conf; @@ -250,6 +306,11 @@ WriterOptions memory(MemoryManager value) { memoryManagerValue = value; return this; } + + WriterOptions callback(WriterCallback callback) { + this.callback = callback; + return this; + } } /** @@ -263,7 +324,7 @@ public static WriterOptions writerOptions(Configuration conf) { * Create an ORC file writer. This is the public interface for creating * writers going forward and new options will only be added to this method. 
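Illustration (not part of the patch): the reader-side churn visible in FileDump, OrcInputFormat and OrcNewInputFormat is a mechanical migration from the old three-argument factory to the new options builder, which makes the filesystem optional and leaves room for knobs such as maxLength without further overloads:

// before this patch
Reader reader = OrcFile.createReader(fs, path, conf);

// after this patch
Reader reader = OrcFile.createReader(path,
    OrcFile.readerOptions(conf).filesystem(fs));   // .filesystem(fs) is optional

The new WriterCallback added above gives callers a hook immediately before each stripe and before the footer is written, wired through WriterOptions.callback into createWriter.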
* @param path filename to write to - * @param options the options + * @param opts the options * @return a new ORC file writer * @throws IOException */ @@ -277,7 +338,7 @@ public static Writer createWriter(Path path, opts.stripeSizeValue, opts.compressValue, opts.bufferSizeValue, opts.rowIndexStrideValue, opts.memoryManagerValue, opts.blockPaddingValue, - opts.versionValue); + opts.versionValue, opts.callback); } /** diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index 180be2f..b5d4e1d 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -36,16 +36,18 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.io.AcidInputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.InputFormatChecker; -import org.apache.hadoop.hive.ql.io.orc.Metadata; +import org.apache.hadoop.hive.ql.io.RecordIdentifier; import org.apache.hadoop.hive.ql.io.orc.Reader.FileMetaInfo; -import org.apache.hadoop.hive.ql.io.orc.RecordReader; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.log.PerfLogger; @@ -53,11 +55,11 @@ import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.shims.HadoopShims; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.InvalidInputException; @@ -72,9 +74,8 @@ * A MapReduce/Hive input format for ORC files. */ public class OrcInputFormat implements InputFormat, - InputFormatChecker, VectorizedInputFormatInterface { - - VectorizedOrcInputFormat voif = new VectorizedOrcInputFormat(); + InputFormatChecker, VectorizedInputFormatInterface, + AcidInputFormat { private static final Log LOG = LogFactory.getLog(OrcInputFormat.class); static final String MIN_SPLIT_SIZE = "mapred.min.split.size"; @@ -106,12 +107,12 @@ OrcRecordReader(Reader file, Configuration conf, - long offset, long length) throws IOException { + OrcSplit split) throws IOException { List types = file.getTypes(); numColumns = (types.size() == 0) ? 
0 : types.get(0).getSubtypesCount(); + this.offset = split.getStart(); + this.length = split.getLength(); this.reader = createReaderFromFile(file, conf, offset, length); - this.offset = offset; - this.length = length; } @Override @@ -151,26 +152,17 @@ public float getProgress() throws IOException { } } - static RecordReader createReaderFromFile( - Reader file, Configuration conf, long offset, long length) - throws IOException { + static RecordReader createReaderFromFile(Reader file, Configuration conf, + long offset, long length + ) throws IOException { List types = file.getTypes(); - boolean[] includedColumns = findIncludedColumns(types, conf); + boolean[] includedColumns = findIncludedColumns(types, conf, true, 0); String[] columnNames = getIncludedColumnNames(types, includedColumns, - conf); - SearchArgument sarg = createSarg(types, conf); - RecordReader reader = - file.rows(offset, length, includedColumns, sarg, columnNames); - return reader; + conf, 0); + SearchArgument sarg = createSarg(conf); + return file.rows(offset, length, includedColumns, sarg, columnNames); } - private static final PathFilter hiddenFileFilter = new PathFilter(){ - public boolean accept(Path p){ - String name = p.getName(); - return !name.startsWith("_") && !name.startsWith("."); - } - }; - /** * Recurse down into a type subtree turning on all of the sub-columns. * @param types the types of the file @@ -188,7 +180,7 @@ static void includeColumnRecursive(List types, } } - public static SearchArgument createSarg(List types, Configuration conf) { + static SearchArgument createSarg(Configuration conf) { String serializedPushdown = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR); if (serializedPushdown == null || conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) == null) { @@ -201,19 +193,24 @@ public static SearchArgument createSarg(List types, Configuration return sarg; } - public static String[] getIncludedColumnNames( - List types, boolean[] includedColumns, Configuration conf) { - String columnNamesString = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR); + public static String[] + getIncludedColumnNames(List types, + boolean[] includedColumns, + Configuration conf, + int rootColumn) { + String columnNamesString = + conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR); if (LOG.isDebugEnabled()) { LOG.debug("included columns names = " + columnNamesString); } - if (columnNamesString == null || conf.get(TableScanDesc.FILTER_EXPR_CONF_STR) == null) { + if (columnNamesString == null || + conf.get(TableScanDesc.FILTER_EXPR_CONF_STR) == null) { return null; } String[] neededColumnNames = columnNamesString.split(","); int i = 0; String[] columnNames = new String[types.size()]; - for(int columnId: types.get(0).getSubtypesList()) { + for(int columnId: types.get(rootColumn).getSubtypesList()) { if (includedColumns == null || includedColumns[columnId]) { columnNames[columnId] = neededColumnNames[i++]; } @@ -225,17 +222,28 @@ public static SearchArgument createSarg(List types, Configuration * Take the configuration and figure out which columns we need to include. * @param types the types of the file * @param conf the configuration + * @param isOriginal is the file the non-acid layout? 
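Illustration (not part of the patch): the new isOriginal/rootColumn parameters exist because ACID files wrap the user row: the ORC schema of a delta or new-style base puts the event metadata columns first and the original row inside a nested struct, so the user-visible columns hang off rootColumn rather than off type 0. For a pre-ACID file rootColumn stays 0 and nothing changes; for an ACID file columns 1..rootColumn-1 (the metadata) are always kept and the projection is applied to the subtypes of rootColumn. The read path later in this patch calls it roughly like this (OrcRecordUpdater.ROW is defined outside this excerpt):

boolean[] included = OrcInputFormat.findIncludedColumns(
    reader.getTypes(), conf,
    /* isOriginal */ false,
    /* rootColumn */ OrcRecordUpdater.ROW + 1);   // the nested "row" struct
String[] names = OrcInputFormat.getIncludedColumnNames(
    reader.getTypes(), included, conf, OrcRecordUpdater.ROW + 1);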
+ * @param rootColumn which column id is the user visible column * @return true for each column that should be included */ - public static boolean[] findIncludedColumns(List types, Configuration conf) { - LOG.info("included column ids = " + conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); + public static boolean[] findIncludedColumns(List types, + Configuration conf, + boolean isOriginal, + int rootColumn) { + LOG.info("included column ids = " + + conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR)); if (ColumnProjectionUtils.isReadAllColumns(conf)) { return null; } else { int numColumns = types.size(); boolean[] result = new boolean[numColumns]; result[0] = true; - OrcProto.Type root = types.get(0); + if (!isOriginal) { + for(int i=1; i < rootColumn; i++) { + result[i] = true; + } + } + OrcProto.Type root = types.get(rootColumn); List included = ColumnProjectionUtils.getReadColumnIDs(conf); for(int i=0; i < root.getSubtypesCount(); ++i) { if (included.contains(i)) { @@ -252,46 +260,13 @@ public static SearchArgument createSarg(List types, Configuration } } - @SuppressWarnings("unchecked") - @Override - public org.apache.hadoop.mapred.RecordReader - getRecordReader(InputSplit inputSplit, JobConf conf, - Reporter reporter) throws IOException { - if (isVectorMode(conf)) { - org.apache.hadoop.mapred.RecordReader vorr = voif.getRecordReader(inputSplit, conf, - reporter); - return (org.apache.hadoop.mapred.RecordReader) vorr; - } - FileSplit fSplit = (FileSplit)inputSplit; - reporter.setStatus(fSplit.toString()); - Path path = fSplit.getPath(); - FileSystem fs = path.getFileSystem(conf); - Reader reader = null; - - if(!(fSplit instanceof OrcSplit)){ - //If CombineHiveInputFormat is used, it works with FileSplit and not OrcSplit - reader = OrcFile.createReader(fs, path, conf); - } else { - //We have OrcSplit, which may have footer metadata cached, so use the appropriate reader - //constructor - OrcSplit orcSplit = (OrcSplit) fSplit; - if (orcSplit.hasFooter()) { - FileMetaInfo fMetaInfo = orcSplit.getFileMetaInfo(); - reader = OrcFile.createReader(fs, path, fMetaInfo, conf); - } else { - reader = OrcFile.createReader(fs, path, conf); - } - } - return new OrcRecordReader(reader, conf, fSplit.getStart(), fSplit.getLength()); - } - @Override public boolean validateInput(FileSystem fs, HiveConf conf, ArrayList files ) throws IOException { if (isVectorMode(conf)) { - return voif.validateInput(fs, conf, files); + return new VectorizedOrcInputFormat().validateInput(fs, conf, files); } if (files.size() <= 0) { @@ -299,7 +274,8 @@ public boolean validateInput(FileSystem fs, HiveConf conf, } for (FileStatus file : files) { try { - OrcFile.createReader(fs, file.getPath(), conf); + OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(conf).filesystem(fs)); } catch (IOException e) { return false; } @@ -335,41 +311,11 @@ private boolean isVectorMode(Configuration conf) { * the different worker threads. 
*/ static class Context { - static class FileSplitInfo { - FileSplitInfo(Path file, long start, long length, String[] hosts, - FileMetaInfo fileMetaInfo) { - this.file = file; - this.start = start; - this.length = length; - this.hosts = hosts; - this.fileMetaInfo = fileMetaInfo; - } - Path getPath() { - return file; - } - long getStart() { - return start; - } - long getLength() { - return length; - } - String[] getLocations() { - return hosts; - } - FileMetaInfo getFileMetaInfo() { - return fileMetaInfo; - } - private Path file; - private long start; - private long length; - private String[] hosts; - FileMetaInfo fileMetaInfo; - } private final Configuration conf; private static Cache footerCache; private final ExecutorService threadPool; - private final List splits = - new ArrayList(10000); + private final List splits = + new ArrayList(10000); private final List errors = new ArrayList(); private final HadoopShims shims = ShimLoader.getHadoopShims(); private final long maxSize; @@ -379,6 +325,7 @@ FileMetaInfo getFileMetaInfo() { private final AtomicInteger cacheHitCounter = new AtomicInteger(0); private final AtomicInteger numFilesCounter = new AtomicInteger(0); private Throwable fatalError = null; + private IMetaStoreClient.ValidTxnList transactionList; /** * A count of the number of threads that may create more work for the @@ -390,15 +337,18 @@ FileMetaInfo getFileMetaInfo() { this.conf = conf; minSize = conf.getLong(MIN_SPLIT_SIZE, DEFAULT_MIN_SPLIT_SIZE); maxSize = conf.getLong(MAX_SPLIT_SIZE, DEFAULT_MAX_SPLIT_SIZE); - footerInSplits = HiveConf.getBoolVar(conf, ConfVars.HIVE_ORC_INCLUDE_FILE_FOOTER_IN_SPLITS); + footerInSplits = HiveConf.getBoolVar(conf, + ConfVars.HIVE_ORC_INCLUDE_FILE_FOOTER_IN_SPLITS); int cacheStripeDetailsSize = HiveConf.getIntVar(conf, ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE); - int numThreads = HiveConf.getIntVar(conf, ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS); + int numThreads = HiveConf.getIntVar(conf, + ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS); cacheStripeDetails = (cacheStripeDetailsSize > 0); threadPool = Executors.newFixedThreadPool(numThreads, - new ThreadFactoryBuilder().setDaemon(true).setNameFormat("ORC_GET_SPLITS #%d").build()); + new ThreadFactoryBuilder().setDaemon(true) + .setNameFormat("ORC_GET_SPLITS #%d").build()); synchronized (Context.class) { if (footerCache == null && cacheStripeDetails) { @@ -406,6 +356,9 @@ FileMetaInfo getFileMetaInfo() { .initialCapacity(cacheStripeDetailsSize).softValues().build(); } } + transactionList = new HiveMetaStoreClient.ValidTxnListImpl(); + transactionList.fromString(conf.get + (IMetaStoreClient.ValidTxnList.VALID_TXNS_KEY, Long.MAX_VALUE + ":")); } int getSchedulers() { @@ -416,9 +369,9 @@ int getSchedulers() { * Get the Nth split. * @param index if index >= 0, count from the front, otherwise count from * the back. 
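Cross-reference (not part of the patch): Context.splits now collects OrcSplit objects, each of which records whether the file is pre-ACID (isOriginal), whether any base exists (hasBase) and the serialized delta list. In the FileGenerator changes below, a partition with no base and no original files yields one synthetic split per bucket, with the bucket number stored in the split's start offset and recovered later in getReader as (int) split.getStart(); the bucket count itself comes from the BUCKET_COUNT table property, which is why the first hunk of this patch makes Utilities.copyTableJobPropertiesToConf copy it into the JobConf.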
- * @result the Nth file split + * @return the Nth file split */ - FileSplitInfo getResult(int index) { + OrcSplit getResult(int index) { if (index >= 0) { return splits.get(index); } else { @@ -436,7 +389,8 @@ FileSplitInfo getResult(int index) { */ synchronized void schedule(Runnable runnable) { if (fatalError == null) { - if (runnable instanceof FileGenerator || runnable instanceof SplitGenerator) { + if (runnable instanceof FileGenerator || + runnable instanceof SplitGenerator) { schedulers += 1; } threadPool.execute(runnable); @@ -497,23 +451,49 @@ synchronized void waitForTasks() { this.dir = dir; } + private void scheduleSplits(FileStatus file, + boolean isOriginal, + boolean hasBase, + List deltas) throws IOException{ + FileInfo info = null; + if (context.cacheStripeDetails) { + info = verifyCachedFileInfo(file); + } + new SplitGenerator(context, fs, file, info, isOriginal, deltas, + hasBase).schedule(); + } + /** * For each path, get the list of files and blocks that they consist of. */ @Override public void run() { try { - Iterator itr = context.shims.listLocatedStatus(fs, dir, - hiddenFileFilter); - while (itr.hasNext()) { - FileStatus file = itr.next(); - if (!file.isDir()) { - FileInfo fileInfo = null; - if (context.cacheStripeDetails) { - fileInfo = verifyCachedFileInfo(file); + AcidUtils.Directory dirInfo = AcidUtils.getAcidState(dir, + context.conf, context.transactionList); + List deltas = + AcidUtils.serializeDeltas(dirInfo.getCurrentDirectories()); + boolean hasBase = dirInfo.getBaseDirectory() != null || + !dirInfo.getOriginalFiles().isEmpty(); + if (hasBase) { + List originals = dirInfo.getOriginalFiles(); + if (originals.isEmpty()) { + Iterator itr = context.shims.listLocatedStatus(fs, + dirInfo.getBaseDirectory(),AcidUtils.hiddenFileFilter); + while (itr.hasNext()) { + scheduleSplits(itr.next(), false, true, deltas); + } + } else { + for(FileStatus file: originals) { + scheduleSplits(file, true, true, deltas); } - SplitGenerator spgen = new SplitGenerator(context, fs, file, fileInfo); - spgen.schedule(); + } + } else { + int numBuckets = + context.conf.getInt(hive_metastoreConstants.BUCKET_COUNT, 0); + for(int b=0; b < numBuckets; ++b) { + context.splits.add(new OrcSplit(dir, b, 0, new String[0], null, + false, false, deltas)); } } } catch (Throwable th) { @@ -538,7 +518,8 @@ private FileInfo verifyCachedFileInfo(FileStatus file) { if (LOG.isDebugEnabled()) { LOG.debug("Info cached for path: " + file.getPath()); } - if (fileInfo.modificationTime == file.getModificationTime() && fileInfo.size == file.getLen()) { + if (fileInfo.modificationTime == file.getModificationTime() && + fileInfo.size == file.getLen()) { // Cached copy is valid context.cacheHitCounter.incrementAndGet(); return fileInfo; @@ -546,10 +527,12 @@ private FileInfo verifyCachedFileInfo(FileStatus file) { // Invalidate Context.footerCache.invalidate(file.getPath()); if (LOG.isDebugEnabled()) { - LOG.debug("Meta-Info for : " + file.getPath() + " changed. CachedModificationTime: " + LOG.debug("Meta-Info for : " + file.getPath() + + " changed. 
CachedModificationTime: " + fileInfo.modificationTime + ", CurrentModificationTime: " + file.getModificationTime() - + ", CachedLength: " + fileInfo.size + ", CurrentLength: " + file.getLen()); + + ", CachedLength: " + fileInfo.size + ", CurrentLength: " + + file.getLen()); } } } else { @@ -576,16 +559,24 @@ private FileInfo verifyCachedFileInfo(FileStatus file) { private FileMetaInfo fileMetaInfo; private Metadata metadata; private List types; - + private final boolean isOriginal; + private final List deltas; + private final boolean hasBase; SplitGenerator(Context context, FileSystem fs, - FileStatus file, FileInfo fileInfo) throws IOException { + FileStatus file, FileInfo fileInfo, + boolean isOriginal, + List deltas, + boolean hasBase) throws IOException { this.context = context; this.fs = fs; this.file = file; this.blockSize = file.getBlockSize(); this.fileInfo = fileInfo; locations = context.shims.getLocations(fs, file); + this.isOriginal = isOriginal; + this.deltas = deltas; + this.hasBase = hasBase; } Path getPath() { @@ -596,8 +587,8 @@ void schedule() throws IOException { if(locations.length == 1 && file.getLen() < context.maxSize) { String[] hosts = locations[0].getHosts(); synchronized (context.splits) { - context.splits.add(new Context.FileSplitInfo(file.getPath(), 0, - file.getLen(), hosts, fileMetaInfo)); + context.splits.add(new OrcSplit(file.getPath(), 0, file.getLen(), + hosts, fileMetaInfo, isOriginal, hasBase, deltas)); } } else { // if it requires a compute task @@ -639,7 +630,8 @@ static long getOverlap(long offset1, long length1, * @param fileMetaInfo file metadata from footer and postscript * @throws IOException */ - void createSplit(long offset, long length, FileMetaInfo fileMetaInfo) throws IOException { + void createSplit(long offset, long length, + FileMetaInfo fileMetaInfo) throws IOException { String[] hosts; if ((offset % blockSize) + length <= blockSize) { // handle the single block case @@ -683,8 +675,8 @@ void createSplit(long offset, long length, FileMetaInfo fileMetaInfo) throws IOE hostList.toArray(hosts); } synchronized (context.splits) { - context.splits.add(new Context.FileSplitInfo(file.getPath(), offset, - length, hosts, fileMetaInfo)); + context.splits.add(new OrcSplit(file.getPath(), offset, length, + hosts, fileMetaInfo, isOriginal, hasBase, deltas)); } } @@ -697,23 +689,33 @@ public void run() { try { populateAndCacheStripeDetails(); Configuration conf = context.conf; - SearchArgument sarg = createSarg(types, conf); + SearchArgument sarg; + if (deltas.isEmpty()) { + sarg = createSarg(conf); + } else { + sarg = null; + } List stripeStats = null; int[] filterColumns = null; if (sarg != null) { - List sargLeaves = null; - String[] allColumns = conf.get(serdeConstants.LIST_COLUMNS).split(","); - String[] neededColumns = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR).split(","); + List sargLeaves; + String[] allColumns = + conf.get(serdeConstants.LIST_COLUMNS).split(","); + String[] neededColumns = + conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) + .split(","); sargLeaves = sarg.getLeaves(); filterColumns = new int[sargLeaves.size()]; for (int i = 0; i < filterColumns.length; ++i) { String colName = sargLeaves.get(i).getColumnName(); - // if needed columns does not contain the column specified in filter expression then - // it must be partition column. 
There will not be columns within ORC file for partitioned - // column, so we can ignore them + // if needed columns does not contain the column specified in filter + // expression then it must be partition column. There will not be + // columns within ORC file for partitioned column, so we can ignore + // them if (containsColumn(neededColumns, colName)) { - filterColumns[i] = RecordReaderImpl.findColumns(allColumns, colName); + filterColumns[i] = RecordReaderImpl.findColumns(allColumns, + colName); } else { filterColumns[i] = -1; } @@ -732,11 +734,13 @@ public void run() { if (sarg != null && stripeStats != null && idx < stripeStats.size() && - !isStripeSatisfyPredicate(stripeStats.get(idx), sarg, filterColumns)) { + !isStripeSatisfyPredicate(stripeStats.get(idx), sarg, + filterColumns)) { // if a stripe doesn't satisfy predicate condition then skip it if (LOG.isDebugEnabled()) { - LOG.debug("Eliminating ORC stripe-" + idx + " of file '" + file.getPath() + LOG.debug("Eliminating ORC stripe-" + idx + " of file '" + + file.getPath() + "' as it did not satisfy predicate condition."); } @@ -797,23 +801,26 @@ private void populateAndCacheStripeDetails() { types = fileInfo.types; // For multiple runs, in case sendSplitsInFooter changes if (fileMetaInfo == null && context.footerInSplits) { - orcReader = OrcFile.createReader(fs, file.getPath(), context.conf); + orcReader = OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(context.conf).filesystem(fs)); fileInfo.fileMetaInfo = orcReader.getFileMetaInfo(); fileInfo.metadata = orcReader.getMetadata(); fileInfo.types = orcReader.getTypes(); } } if (!found) { - orcReader = OrcFile.createReader(fs, file.getPath(), context.conf); + orcReader = OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(context.conf).filesystem(fs)); stripes = orcReader.getStripes(); metadata = orcReader.getMetadata(); types = orcReader.getTypes(); - fileMetaInfo = context.footerInSplits ? orcReader.getFileMetaInfo() : null; + fileMetaInfo = context.footerInSplits ? + orcReader.getFileMetaInfo() : null; if (context.cacheStripeDetails) { // Populate into cache. 
Context.footerCache.put(file.getPath(), - new FileInfo(file.getModificationTime(), file.getLen(), stripes, metadata, - types, fileMetaInfo)); + new FileInfo(file.getModificationTime(), file.getLen(), stripes, + metadata, types, fileMetaInfo)); } } } catch (Throwable th) { @@ -847,10 +854,12 @@ private boolean isStripeSatisfyPredicate(StripeStatistics stripeStatistics, if (filterColumns[pred] != -1) { // column statistics at index 0 contains only the number of rows - ColumnStatistics stats = stripeStatistics.getColumnStatistics()[filterColumns[pred] + 1]; + ColumnStatistics stats = + stripeStatistics.getColumnStatistics()[filterColumns[pred] + 1]; Object minValue = getMin(stats); Object maxValue = getMax(stats); - truthValues[pred] = RecordReaderImpl.evaluatePredicateRange(predLeaves.get(pred), + truthValues[pred] = + RecordReaderImpl.evaluatePredicateRange(predLeaves.get(pred), minValue, maxValue); } else { @@ -894,7 +903,7 @@ private Object getMin(ColumnStatistics index) { } } - static List generateSplitsInfo(Configuration conf) + static List generateSplitsInfo(Configuration conf) throws IOException { // use threads to resolve directories into splits Context context = new Context(conf); @@ -922,20 +931,14 @@ private Object getMin(ColumnStatistics index) { } return context.splits; } + @Override public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.ORC_GET_SPLITS); - List splits = - OrcInputFormat.generateSplitsInfo(job); - InputSplit[] result = new InputSplit[splits.size()]; - for (int i=0;i result = generateSplitsInfo(job); perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.ORC_GET_SPLITS); - return result; + return result.toArray(new InputSplit[result.size()]); } /** @@ -953,8 +956,10 @@ private Object getMin(ColumnStatistics index) { List types; - FileInfo(long modificationTime, long size, Iterable stripeInfos, - Metadata metadata, List types, FileMetaInfo fileMetaInfo) { + FileInfo(long modificationTime, long size, + Iterable stripeInfos, + Metadata metadata, List types, + FileMetaInfo fileMetaInfo) { this.modificationTime = modificationTime; this.size = size; this.stripeInfos = stripeInfos; @@ -963,4 +968,186 @@ private Object getMin(ColumnStatistics index) { this.types = types; } } + + @SuppressWarnings("unchecked") + private org.apache.hadoop.mapred.RecordReader + createVectorizedReader(InputSplit split, JobConf conf, Reporter reporter + ) throws IOException { + return (org.apache.hadoop.mapred.RecordReader) + new VectorizedOrcInputFormat().getRecordReader(split, conf, reporter); + } + + @Override + public org.apache.hadoop.mapred.RecordReader + getRecordReader(InputSplit inputSplit, JobConf conf, + Reporter reporter) throws IOException { + // TODO vectorized reader doesn't work with the new format yet + if (isVectorMode(conf)) { + return createVectorizedReader(inputSplit, conf, reporter); + } + reporter.setStatus(inputSplit.toString()); + OrcSplit split = (OrcSplit) inputSplit; + // if we are strictly old-school, just use the old code + if (split.isOriginal() && split.getDeltas().isEmpty()) { + return new OrcRecordReader(OrcFile.createReader(split.getPath(), + OrcFile.readerOptions(conf)), conf, split); + } + Options options = new Options(conf).reporter(reporter); + final RowReader inner = getReader(inputSplit, options); + final RecordIdentifier id = inner.createKey(); + return new org.apache.hadoop.mapred.RecordReader(){ + @Override + public boolean next(NullWritable nullWritable, + OrcStruct orcStruct) throws 
IOException { + return inner.next(id, orcStruct); + } + + @Override + public NullWritable createKey() { + return NullWritable.get(); + } + + @Override + public OrcStruct createValue() { + return inner.createValue(); + } + + @Override + public long getPos() throws IOException { + return inner.getPos(); + } + + @Override + public void close() throws IOException { + inner.close(); + } + + @Override + public float getProgress() throws IOException { + return inner.getProgress(); + } + }; + } + + + @Override + public RowReader getReader(InputSplit inputSplit, + Options options) throws IOException { + final OrcSplit split = (OrcSplit) inputSplit; + final Path path = split.getPath(); + final Path[] deltas = AcidUtils.deserializeDeltas(path, + split.getDeltas()); + final Configuration conf = options.getConfiguration(); + final Reader reader; + final boolean[] includedColumns; + final String[] columnNames; + final SearchArgument sarg; + final int bucket; + if (split.hasBase()) { + bucket = AcidUtils.parseBaseBucketFilename(split.getPath(), conf) + .getBucket(); + reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + final List types = reader.getTypes(); + final int rootColumn = split.isOriginal() ? 0 : OrcRecordUpdater.ROW + 1; + includedColumns = findIncludedColumns(types, conf, + split.isOriginal(), rootColumn); + columnNames = getIncludedColumnNames(types, includedColumns, + conf, rootColumn); + sarg = createSarg(conf); + } else { + bucket = (int) split.getStart(); + reader = null; + //TODO really need column projection even for base-less input splits + includedColumns = null; + columnNames = null; + sarg = null; + } + final OrcRawRecordMerger records = + new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, + split.getStart(), split.getLength(), includedColumns, sarg, + columnNames, deltas); + return new RowReader() { + @Override + public ObjectInspector getObjectInspector() { + // TODO + return null; + } + + @Override + public boolean next(RecordIdentifier recordIdentifier, + OrcStruct orcStruct) throws IOException { + boolean result; + // filter out the deleted records + do { + result = records.next(recordIdentifier, orcStruct); + } while (result && + OrcRecordUpdater.getOperation(orcStruct) == + OrcRecordUpdater.DELETE_OPERATION); + return result; + } + + @Override + public RecordIdentifier createKey() { + return records.createKey(); + } + + @Override + public OrcStruct createValue() { + return records.createValue(); + } + + @Override + public long getPos() throws IOException { + return records.getPos(); + } + + @Override + public void close() throws IOException { + records.close(); + } + + @Override + public float getProgress() throws IOException { + return records.getProgress(); + } + }; + } + + static Path findOriginalBucket(FileSystem fs, + Path directory, + int bucket) throws IOException { + for(FileStatus stat: fs.listStatus(directory)) { + String name = stat.getPath().getName(); + if (Integer.parseInt(name.substring(0, name.indexOf('_'))) == bucket) { + return stat.getPath(); + } + } + throw new IllegalArgumentException("Can't find bucket " + bucket + " in " + + directory); + } + + @Override + public RawReader getRawReader(Configuration conf, + boolean collapseEvents, + int bucket, Path baseDirectory, + Path... 
deltaDirectory + ) throws IOException { + Reader reader = null; + boolean isOriginal = false; + if (baseDirectory != null) { + Path bucketFile; + if (baseDirectory.getName().startsWith(AcidUtils.BASE_PREFIX)) { + bucketFile = AcidUtils.createBucketFile(baseDirectory, bucket); + } else { + isOriginal = true; + bucketFile = findOriginalBucket(baseDirectory.getFileSystem(conf), + baseDirectory, bucket); + } + reader = OrcFile.createReader(bucketFile, OrcFile.readerOptions(conf)); + } + return new OrcRawRecordMerger(conf, collapseEvents, reader, isOriginal, + bucket, 0, Long.MAX_VALUE, null, null, null, deltaDirectory); + } + + } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewInputFormat.java index ec477e2..ab3164e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewInputFormat.java @@ -48,8 +48,8 @@ Path path = fileSplit.getPath(); Configuration conf = ShimLoader.getHadoopShims() .getConfiguration(context); - FileSystem fs = path.getFileSystem(conf); - return new OrcRecordReader(OrcFile.createReader(fs, path, conf), + return new OrcRecordReader(OrcFile.createReader(path, + OrcFile.readerOptions(conf)), ShimLoader.getHadoopShims().getConfiguration(context), fileSplit.getStart(), fileSplit.getLength()); } @@ -118,15 +118,14 @@ public boolean nextKeyValue() throws IOException, InterruptedException { public List getSplits(JobContext jobContext) throws IOException, InterruptedException { perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.ORC_GET_SPLITS); - List splits = + Configuration conf = + ShimLoader.getHadoopShims().getConfiguration(jobContext); + List splits = OrcInputFormat.generateSplitsInfo(ShimLoader.getHadoopShims() .getConfiguration(jobContext)); List result = new ArrayList(); - for (OrcInputFormat.Context.FileSplitInfo split : splits) { - FileSplit newSplit = new OrcNewSplit(split.getPath(), - split.getStart(), split.getLength(), split.getLocations(), - split.getFileMetaInfo()); - result.add(newSplit); + for(OrcSplit split: OrcInputFormat.generateSplitsInfo(conf)) { + result.add(new OrcNewSplit(split)); } perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.ORC_GET_SPLITS); return result; diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java index 3a5ba1b..351e141 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcNewSplit.java @@ -21,6 +21,7 @@ import java.io.DataOutput; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.orc.Reader.FileMetaInfo; @@ -33,75 +34,69 @@ * */ public class OrcNewSplit extends FileSplit { - private Reader.FileMetaInfo fileMetaInfo; - private boolean hasFooter; - + private final OrcSplit inner; + protected OrcNewSplit(){ //The FileSplit() constructor in hadoop 0.20 and 1.x is package private so can't use it. //This constructor is used to create the object and then call readFields() // so just pass nulls to this super constructor. 
super(null, 0, 0, (String[])null); + inner = new OrcSplit(); } - public OrcNewSplit(Path path, long offset, long length, String[] hosts, - FileMetaInfo fileMetaInfo) { - super(path, offset, length, hosts); - this.fileMetaInfo = fileMetaInfo; - hasFooter = this.fileMetaInfo != null; + public OrcNewSplit(OrcSplit inner) throws IOException { + super(inner.getPath(), inner.getStart(), inner.getLength(), + inner.getLocations()); + this.inner = inner; } @Override public void write(DataOutput out) throws IOException { - //serialize path, offset, length using FileSplit - super.write(out); - - // Whether footer information follows. - out.writeBoolean(hasFooter); - - if (hasFooter) { - // serialize FileMetaInfo fields - Text.writeString(out, fileMetaInfo.compressionType); - WritableUtils.writeVInt(out, fileMetaInfo.bufferSize); - WritableUtils.writeVInt(out, fileMetaInfo.metadataSize); - - // serialize FileMetaInfo field footer - ByteBuffer footerBuff = fileMetaInfo.footerBuffer; - footerBuff.reset(); - - // write length of buffer - WritableUtils.writeVInt(out, footerBuff.limit() - footerBuff.position()); - out.write(footerBuff.array(), footerBuff.position(), - footerBuff.limit() - footerBuff.position()); - } + inner.write(out); } @Override public void readFields(DataInput in) throws IOException { - //deserialize path, offset, length using FileSplit - super.readFields(in); - - hasFooter = in.readBoolean(); + inner.readFields(in); + } - if (hasFooter) { - // deserialize FileMetaInfo fields - String compressionType = Text.readString(in); - int bufferSize = WritableUtils.readVInt(in); - int metadataSize = WritableUtils.readVInt(in); + @Override + public Path getPath() { + return inner.getPath(); + } - // deserialize FileMetaInfo field footer - int footerBuffSize = WritableUtils.readVInt(in); - ByteBuffer footerBuff = ByteBuffer.allocate(footerBuffSize); - in.readFully(footerBuff.array(), 0, footerBuffSize); + @Override + public long getStart() { + return inner.getStart(); + } - fileMetaInfo = new FileMetaInfo(compressionType, bufferSize, metadataSize, footerBuff); - } + @Override + public long getLength() { + return inner.getLength(); } public FileMetaInfo getFileMetaInfo(){ - return fileMetaInfo; + return inner.getFileMetaInfo(); } public boolean hasFooter() { - return hasFooter; + return inner.hasFooter(); + } + + public boolean isOriginal() { + return inner.isOriginal(); + } + + public boolean hasBase() { + return inner.hasBase(); + } + + public List getDeltas() { + return inner.getDeltas(); + } + + @Override + public String[] getLocations() throws IOException { + return inner.getLocations(); } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java index 62e7b34..fc19a26 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java @@ -17,14 +17,23 @@ */ package org.apache.hadoop.hive.ql.io.orc; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.FSRecordWriter; -import org.apache.hadoop.hive.ql.io.HiveOutputFormat; +import org.apache.hadoop.hive.ql.io.RecordIdentifier; +import org.apache.hadoop.hive.ql.io.RecordUpdater; import org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow; import 
org.apache.hadoop.hive.serde2.SerDeStats; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileOutputFormat; @@ -34,6 +43,11 @@ import org.apache.hadoop.util.Progressable; import java.io.IOException; +import java.io.PrintStream; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; import java.util.ArrayList; import java.util.Properties; @@ -41,7 +55,7 @@ * A Hive OutputFormat for ORC files. */ public class OrcOutputFormat extends FileOutputFormat - implements HiveOutputFormat { + implements AcidOutputFormat { private static class OrcRecordWriter implements RecordWriter, @@ -160,4 +174,138 @@ public SerDeStats getStats() { return new OrcRecordWriter(path, options); } + + private class DummyOrcRecordUpdater implements RecordUpdater { + private final Path path; + private final ObjectInspector inspector; + private final PrintStream out; + + private DummyOrcRecordUpdater(Path path, Options options) { + this.path = path; + this.inspector = options.getInspector(); + this.out = options.getDummyStream(); + } + + @Override + public void insert(long currentTransaction, int bucket, + Object row) throws IOException { + out.println("insert " + path + " currTxn: " + currentTransaction + + " bucket: " + bucket + " obj: " + stringifyObject(row, inspector)); + } + + @Override + public void update(long currentTransaction, long originalTransaction, + int originalBucket, long rowId, + Object row) throws IOException { + out.println("update " + path + " currTxn: " + currentTransaction + + " origTxn: " + originalTransaction + " bucket: " + originalBucket + + " row: " + rowId + " obj: " + stringifyObject(row, inspector)); + } + + @Override + public void delete(long currentTransaction, long originalTransaction, + int originalBucket, long rowId) throws IOException { + out.println("delete " + path + " currTxn: " + currentTransaction + + " origTxn: " + originalTransaction + " bucket: " + originalBucket + + " row: " + rowId); + } + + @Override + public void flush() throws IOException { + out.println("flush " + path); + } + + @Override + public void close(boolean abort) throws IOException { + out.println("close " + path); + } + + @Override + public SerDeStats getStats() { + return null; + } + + private void stringifyObject(StringBuilder buffer, + Object obj, + ObjectInspector inspector + ) throws IOException { + if (inspector instanceof StructObjectInspector) { + buffer.append("{ "); + StructObjectInspector soi = (StructObjectInspector) inspector; + boolean isFirst = true; + for(StructField field: soi.getAllStructFieldRefs()) { + if (isFirst) { + isFirst = false; + } else { + buffer.append(", "); + } + buffer.append(field.getFieldName()); + buffer.append(": "); + stringifyObject(buffer, soi.getStructFieldData(obj, field), + field.getFieldObjectInspector()); + } + buffer.append(" }"); + } else if (inspector instanceof PrimitiveObjectInspector) { + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) 
inspector; + buffer.append(poi.getPrimitiveJavaObject(obj).toString()); + } else { + buffer.append("*unknown*"); + } + } + + private String stringifyObject(Object obj, + ObjectInspector inspector + ) throws IOException { + StringBuilder buffer = new StringBuilder(); + stringifyObject(buffer, obj, inspector); + return buffer.toString(); + } + } + + @Override + public RecordUpdater getRecordUpdater(Path path, + Options options) throws IOException { + if (options.getDummyStream() != null) { + return new DummyOrcRecordUpdater(path, options); + } else { + return new OrcRecordUpdater(path, options); + } + } + + @Override + public FSRecordWriter getRawRecordWriter(Path path, + Options options) throws IOException { + final Path filename = AcidUtils.createFilename(path, options); + final OrcFile.WriterOptions opts = + OrcFile.writerOptions(options.getConfiguration()); + if (!options.isWritingBase()) { + opts.bufferSize(OrcRecordUpdater.DELTA_BUFFER_SIZE) + .stripeSize(OrcRecordUpdater.DELTA_STRIPE_SIZE) + .blockPadding(false) + .compress(CompressionKind.NONE) + .rowIndexStride(0); + } + final OrcRecordUpdater.KeyIndexBuilder watcher = + new OrcRecordUpdater.KeyIndexBuilder(); + opts.inspector(options.getInspector()) + .callback(watcher); + final Writer writer = OrcFile.createWriter(filename, opts); + return new FSRecordWriter() { + @Override + public void write(Writable w) throws IOException { + OrcStruct orc = (OrcStruct) w; + watcher.addKey( + ((LongWritable) + orc.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION)).get(), + ((IntWritable) orc.getFieldValue(OrcRecordUpdater.BUCKET)).get(), + ((LongWritable) orc.getFieldValue(OrcRecordUpdater.ROW_ID)).get()); + writer.addRow(w); + } + + @Override + public void close(boolean abort) throws IOException { + writer.close(); + } + }; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java new file mode 100644 index 0000000..724384f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRawRecordMerger.java @@ -0,0 +1,474 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.io.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidInputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.RecordIdentifier; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +public class OrcRawRecordMerger implements AcidInputFormat.RawReader { + private final Configuration conf; + private final boolean collapse; + private final RecordReader baseReader; + private final long offset; + private final long length; + private ReaderKey prevKey = new ReaderKey(); + // this is the key less than the lowest key we need to process + private RecordIdentifier minKey; + // this is the last key we need to process + private RecordIdentifier maxKey; + + final static class ReaderKey extends RecordIdentifier{ + private long currentTransactionId; + + public ReaderKey() { + this(-1, -1, -1, -1); + } + + public ReaderKey(long originalTransaction, int bucket, long rowId, + long currentTransactionId) { + super(originalTransaction, bucket, rowId); + this.currentTransactionId = currentTransactionId; + } + + @Override + public void set(RecordIdentifier other) { + super.set(other); + currentTransactionId = ((ReaderKey) other).currentTransactionId; + } + + public void setValues(long originalTransactionId, + int bucket, + long rowId, + long currentTransactionId) { + setValues(originalTransactionId, bucket, rowId); + this.currentTransactionId = currentTransactionId; + } + + @Override + public boolean equals(Object other) { + return super.equals(other) && + other.getClass() == ReaderKey.class && + currentTransactionId == ((ReaderKey) other).currentTransactionId; + } + + @Override + public int compareTo(RecordIdentifier other) { + int sup = compareToInternal(other); + if (sup == 0) { + if (other.getClass() == ReaderKey.class) { + ReaderKey oth = (ReaderKey) other; + if (currentTransactionId != oth.currentTransactionId) { + return currentTransactionId < oth.currentTransactionId ? +1 : -1; + } + } else { + return -1; + } + } + return sup; + } + + public long getCurrentTransactionId() { + return currentTransactionId; + } + + /** + * Compare rows without considering the currentTransactionId. 
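To make the ReaderKey ordering above concrete, here is a small stand-alone sketch (plain Java; the EventKey name and fields are illustrative, not part of the patch). Events order by (originalTransaction, bucket, rowId), and when two events describe the same row the one with the larger currentTransactionId sorts first, which is what lets the merger emit the newest version of a row and, when collapsing, skip the older ones.

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    // Illustrative stand-in for ReaderKey: same ordering rules, none of the ORC plumbing.
    class EventKey implements Comparable<EventKey> {
      final long originalTransaction;
      final int bucket;
      final long rowId;
      final long currentTransaction;

      EventKey(long originalTransaction, int bucket, long rowId, long currentTransaction) {
        this.originalTransaction = originalTransaction;
        this.bucket = bucket;
        this.rowId = rowId;
        this.currentTransaction = currentTransaction;
      }

      // Row identity only, like compareRow(): currentTransaction is ignored.
      int compareRow(EventKey o) {
        int c = Long.compare(originalTransaction, o.originalTransaction);
        if (c == 0) c = Integer.compare(bucket, o.bucket);
        if (c == 0) c = Long.compare(rowId, o.rowId);
        return c;
      }

      // Full ordering, like compareTo(): for the same row the newer transaction sorts first.
      public int compareTo(EventKey o) {
        int c = compareRow(o);
        if (c == 0 && currentTransaction != o.currentTransaction) {
          return currentTransaction < o.currentTransaction ? +1 : -1;
        }
        return c;
      }

      public String toString() {
        return "{origTxn: " + originalTransaction + ", bucket: " + bucket +
            ", row: " + rowId + ", currentTxn: " + currentTransaction + "}";
      }
    }

    public class ReaderKeyOrderingDemo {
      public static void main(String[] args) {
        List<EventKey> events = new ArrayList<EventKey>();
        events.add(new EventKey(0, 3, 7, 0));   // original insert of row 7
        events.add(new EventKey(0, 3, 7, 12));  // update of the same row in transaction 12
        events.add(new EventKey(0, 3, 8, 0));   // a different row
        Collections.sort(events);
        // Prints the txn-12 version of row 7, then the txn-0 version, then row 8; with
        // collapsing enabled the merger keeps only the first (newest) version of each row.
        for (EventKey e : events) {
          System.out.println(e);
        }
      }
    }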
+ * @param other the value to compare to + * @return -1, 0, +1 + */ + public int compareRow(RecordIdentifier other) { + return compareToInternal(other); + } + + @Override + public String toString() { + return "{originalTxn: " + getTransactionId() + ", bucket: " + + getBucketId() + ", row: " + getRowId() + ", currentTxn: " + + currentTransactionId + "}"; + } + } + + static class ReaderPair { + OrcStruct nextRecord; + final RecordReader recordReader; + final ReaderKey key; + final RecordIdentifier maxKey; + final int bucket; + + ReaderPair(ReaderKey key, Reader reader, int bucket, + RecordIdentifier minKey, RecordIdentifier maxKey, boolean[] include) throws IOException { + this(key, reader, bucket, minKey, maxKey, 0, include, null, null); + } + + ReaderPair(ReaderKey key, Reader reader, int bucket, + RecordIdentifier minKey, RecordIdentifier maxKey, long offset, + boolean[] include, SearchArgument sarg, + String[] columnNames) throws IOException { + this.key = key; + this.maxKey = maxKey; + this.bucket = bucket; + // TODO use stripe statistics to jump over stripes + recordReader = reader.rows(offset, Long.MAX_VALUE, include, sarg, + columnNames); + // advance the reader until we reach the minimum key + do { + next(); + } while (nextRecord != null && + (minKey != null && key.compareRow(minKey) <= 0)); + } + + void next() throws IOException { + if (recordReader.hasNext()) { + nextRecord = (OrcStruct) recordReader.next(nextRecord); + // set the key + key.setValues(OrcRecordUpdater.getOriginalTransaction(nextRecord), + OrcRecordUpdater.getBucket(nextRecord), + OrcRecordUpdater.getRowId(nextRecord), + OrcRecordUpdater.getCurrentTransaction(nextRecord)); + + // if this record is larger than maxKey, we need to stop + if (maxKey != null && key.compareRow(maxKey) > 0) { + nextRecord = null; + recordReader.close(); + } + } else { + nextRecord = null; + recordReader.close(); + } + } + } + + /** + * A reader that pretends an original base file is a new version base file. + * It wraps the underlying reader's row with an ACID event object and + * makes the relevant translations. 
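The translation described above can be pictured with a short stand-alone sketch (plain Java; the array-based event and the helper name are illustrative only). A row read from a pre-ACID file is wrapped into the six-field event layout used elsewhere in this patch, with operation INSERT, both transaction ids set to 0, the reader's bucket, and the row's position in the file as its row id.

    import java.util.Arrays;

    public class WrapOriginalRowDemo {
      // Field positions of the ACID event struct (mirrors the OPERATION..ROW constants
      // that OrcRecordUpdater defines later in this patch).
      static final int OPERATION = 0, CURRENT_TRANSACTION = 1, ORIGINAL_TRANSACTION = 2,
          BUCKET = 3, ROW_ID = 4, ROW = 5;
      static final int INSERT_OPERATION = 0;

      // Pretend the original row was inserted by transaction 0.
      static Object[] wrap(Object originalRow, int bucket, long rowNumber) {
        Object[] event = new Object[6];
        event[OPERATION] = INSERT_OPERATION;
        event[CURRENT_TRANSACTION] = 0L;
        event[ORIGINAL_TRANSACTION] = 0L;
        event[BUCKET] = bucket;
        event[ROW_ID] = rowNumber;   // the row's position in the original file
        event[ROW] = originalRow;    // the unchanged user columns
        return event;
      }

      public static void main(String[] args) {
        // Row 41 of bucket 2 in an old-style file becomes an insert event from txn 0.
        System.out.println(Arrays.toString(wrap("some row", 2, 41L)));
      }
    }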
+ */ + static final class OriginalReaderPair extends ReaderPair { + private LongWritable rowId; + private OrcStruct realRow; + + OriginalReaderPair(ReaderKey key, Reader reader, int bucket, + RecordIdentifier minKey, RecordIdentifier maxKey, + long offset, boolean[] include, SearchArgument sarg, + String[] columnNames) throws IOException { + super(key, reader, bucket, minKey, maxKey, offset, trimInclude(include), + sarg, columnNames); + } + + static boolean[] trimInclude(boolean[] include) { + return Arrays.copyOfRange(include, OrcRecordUpdater.ROW + 1, + include.length); + } + + void next() throws IOException { + if (recordReader.hasNext()) { + long nextRowId = recordReader.getRowNumber(); + realRow = (OrcStruct) recordReader.next(realRow); + // have to do initialization here, because the super's constructor + // calls next and thus we need to initialize before our constructor + // runs + if (nextRecord == null) { + nextRecord = new OrcStruct(OrcRecordUpdater.FIELDS); + IntWritable operation = + new IntWritable(OrcRecordUpdater.INSERT_OPERATION); + nextRecord.setFieldValue(OrcRecordUpdater.OPERATION, operation); + nextRecord.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION, + new LongWritable(0)); + nextRecord.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION, + new LongWritable(0)); + nextRecord.setFieldValue(OrcRecordUpdater.BUCKET, + new IntWritable(bucket)); + this.rowId = new LongWritable(); + nextRecord.setFieldValue(OrcRecordUpdater.ROW_ID, rowId); + key.setValues(0L, bucket, nextRowId, 0L); + } + nextRecord.setFieldValue(OrcRecordUpdater.ROW, realRow); + rowId.set(nextRowId); + key.setRowId(nextRowId); + if (maxKey != null && key.compareRow(maxKey) > 0) { + nextRecord = null; + recordReader.close(); + } + } else { + nextRecord = null; + recordReader.close(); + } + } + } + + private final TreeMap readers = + new TreeMap(); + private Map.Entry primary; + private ReaderKey secondary = null; + + /** + * Find the key range for original bucket files. 
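The bounds discovery that follows walks the stripe list to decide which stripes a split owns and therefore which keys it must process. Below is a stand-alone sketch of that stripe-selection arithmetic with made-up offsets (plain Java; the variable names mirror discoverKeyBounds later in this file, and the sketch is only an illustration, not a drop-in for either method).

    public class StripeSelectionDemo {
      public static void main(String[] args) {
        long[] stripeOffsets = {3, 1003, 2003, 3003};  // start offset of each stripe
        long offset = 1003;                            // start of this split
        long length = 1000;                            // bytes assigned to this split
        long maxOffset = offset + length;
        int firstStripe = 0;    // stripes entirely before the split
        int stripeCount = 0;    // stripes claimed by the split
        boolean isTail = true;  // does the split run to the end of the file?
        for (long stripeStart : stripeOffsets) {
          if (offset > stripeStart) {
            firstStripe += 1;
          } else if (maxOffset > stripeStart) {
            stripeCount += 1;
          } else {
            isTail = false;
            break;
          }
        }
        // The minimum key comes from the stripe just before the split (if any) and the
        // maximum from the last claimed stripe, unless the split reaches the file's end.
        System.out.println("firstStripe=" + firstStripe + " stripeCount=" + stripeCount +
            " isTail=" + isTail);  // firstStripe=1 stripeCount=1 isTail=false
      }
    }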
+ * @param reader the reader + * @param bucket the bucket number we are reading + * @param offset the starting offset + * @param length the number of bytes to read + * @throws IOException + */ + private void discoverOriginalKeyBounds(Reader reader, + int bucket, + long offset, + long length) throws IOException { + long rowLength = 0; + long rowOffset = 0; + long maxOffset = offset + length; + boolean isTail = true; + for(StripeInformation stripe: reader.getStripes()) { + if (offset < stripe.getOffset()) { + rowOffset += stripe.getNumberOfRows(); + } else if (maxOffset <= stripe.getOffset()) { + isTail = false; + break; + } else { + rowLength += stripe.getNumberOfRows(); + } + } + if (rowOffset > 0) { + minKey = new RecordIdentifier(0, bucket, rowOffset - 1); + } + if (!isTail) { + maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1); + } + } + + private void discoverKeyBounds(Reader reader, + long offset, + long length) throws IOException { + RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader); + long maxOffset = offset + length; + int firstStripe = 0; + int stripeCount = 0; + boolean isTail = true; + List stripes = reader.getStripes(); + for(StripeInformation stripe: stripes) { + if (offset > stripe.getOffset()) { + firstStripe += 1; + } else if (maxOffset > stripe.getOffset()) { + stripeCount += 1; + } else { + isTail = false; + break; + } + } + if (firstStripe != 0) { + minKey = keyIndex[firstStripe - 1]; + } + if (!isTail) { + maxKey = keyIndex[firstStripe + stripeCount - 1]; + } + } + + /** + * Create a reader that merge sorts the ACID events together. + * @param conf the configuration + * @param collapseEvents should the events on the same row be collapsed + * @param isOriginal is the base file a pre-acid file + * @param bucket the bucket we are reading + * @param offset the offset into the base file + * @param length the number of bytes to read in the base file + * @param include the columns to be included + * @param sarg the pushdown search arguments + * @param columnNames the column names for the search down arguments + * @param deltaDirectory the list of delta directories to include + * @throws IOException + */ + OrcRawRecordMerger(Configuration conf, + boolean collapseEvents, + Reader reader, + boolean isOriginal, + int bucket, + long offset, + long length, + boolean[] include, + SearchArgument sarg, + String[] columnNames, + Path... 
deltaDirectory) throws IOException { + this.conf = conf; + this.collapse = collapseEvents; + this.offset = offset; + this.length = length; + if (reader == null) { + baseReader = null; + } else { + ReaderPair pair; + ReaderKey key = new ReaderKey(); + if (isOriginal) { + discoverOriginalKeyBounds(reader, bucket, offset, length); + pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey, + offset, include, sarg, columnNames); + } else { + discoverKeyBounds(reader, offset, length); + pair = new ReaderPair(key, reader, bucket, minKey, maxKey, offset, + include, sarg, columnNames); + } + if (pair.nextRecord != null) { + readers.put(key, pair); + } + baseReader = pair.recordReader; + } + for(Path delta: deltaDirectory) { + ReaderKey key = new ReaderKey(); + Path deltaFile = AcidUtils.createBucketFile(delta, bucket); + Reader deltaReader = OrcFile.createReader(deltaFile, + OrcFile.readerOptions(conf)); + ReaderPair deltaPair = new ReaderPair(key, deltaReader, bucket, minKey, + maxKey, include); + if (deltaPair.nextRecord != null) { + readers.put(key, deltaPair); + } + } + primary = readers.pollFirstEntry(); + if (readers.isEmpty()) { + secondary = null; + } else { + secondary = readers.firstKey(); + } + } + + RecordIdentifier getMinKey() { + return minKey; + } + + RecordIdentifier getMaxKey() { + return maxKey; + } + + @Override + public ObjectInspector getObjectInspector() { + // Read the configuration parameters + String columnNameProperty = conf.get("columns"); + // NOTE: if "columns.types" is missing, all columns will be of String type + String columnTypeProperty = conf.get("columns.types"); + + // Parse the configuration parameters + ArrayList columnNames = new ArrayList(); + if (columnNameProperty != null && columnNameProperty.length() > 0) { + Collections.addAll(columnNames, columnNameProperty.split(",")); + } + if (columnTypeProperty == null) { + // Default type: all string + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < columnNames.size(); i++) { + if (i > 0) { + sb.append(":"); + } + sb.append("string"); + } + columnTypeProperty = sb.toString(); + } + + ArrayList fieldTypes = + TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + StructTypeInfo rowType = new StructTypeInfo(); + rowType.setAllStructFieldNames(columnNames); + rowType.setAllStructFieldTypeInfos(fieldTypes); + return OrcRecordUpdater.createEventSchema + (OrcStruct.createObjectInspector(rowType)); + } + + @Override + public boolean next(RecordIdentifier recordIdentifier, + OrcStruct prev) throws IOException { + boolean keysSame = true; + while (keysSame && primary != null) { + ReaderPair pair = primary.getValue(); + prev.swapFields(pair.nextRecord); + recordIdentifier.set(pair.key); + pair.next(); + if (pair.nextRecord == null) { + primary = readers.pollFirstEntry(); + if (readers.isEmpty()) { + secondary = null; + } else { + secondary = readers.firstKey(); + } + } else { + ReaderKey key = primary.getKey(); + if (key.compareTo(secondary) > 0) { + readers.put(key, pair); + primary = readers.pollFirstEntry(); + if (readers.isEmpty()) { + secondary = null; + } else { + secondary = readers.firstKey(); + } + } + } + // if we are collapsing, figure out if this is a new row + if (collapse) { + keysSame = prevKey.compareRow(recordIdentifier) == 0; + if (!keysSame) { + prevKey.set(recordIdentifier); + } + } else { + keysSame = false; + } + } + return !keysSame; + } + + @Override + public RecordIdentifier createKey() { + return new ReaderKey(); + } + + @Override + public OrcStruct createValue() { + 
return new OrcStruct(OrcRecordUpdater.FIELDS); + } + + @Override + public long getPos() throws IOException { + return baseReader == null ? + 0 : offset + (long) (baseReader.getProgress() * length); + } + + @Override + public void close() throws IOException { + for(ReaderPair pair: readers.values()) { + pair.recordReader.close(); + } + } + + @Override + public float getProgress() throws IOException { + return baseReader == null ? 1 : baseReader.getProgress(); + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java new file mode 100644 index 0000000..38bf63f --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcRecordUpdater.java @@ -0,0 +1,289 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.orc; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.RecordIdentifier; +import org.apache.hadoop.hive.ql.io.RecordUpdater; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; + +import java.util.ArrayList; +import java.util.List; + +/** + * A RecordUpdater where the files are stored as ORC. 
+ */ +public class OrcRecordUpdater implements RecordUpdater { + final static String ACID_KEY_INDEX_NAME = "hive.acid.key.index"; + + final static int INSERT_OPERATION = 0; + final static int UPDATE_OPERATION = 1; + final static int DELETE_OPERATION = 2; + + final static int OPERATION = 0; + final static int CURRENT_TRANSACTION = 1; + final static int ORIGINAL_TRANSACTION = 2; + final static int BUCKET = 3; + final static int ROW_ID = 4; + final static int ROW = 5; + final static int FIELDS = 6; + + final static int DELTA_BUFFER_SIZE = 16 * 1024; + final static long DELTA_STRIPE_SIZE = 16 * 1024 * 1024; + + private final AcidOutputFormat.Options options; + private final Path path; + private final FileSystem fs; + private Writer writer; + private FSDataOutputStream flushLengths = null; + private final OrcStruct item; + private final IntWritable operation = new IntWritable(); + private final LongWritable currentTransaction = new LongWritable(-1); + private final LongWritable originalTransaction = new LongWritable(-1); + private final IntWritable bucket = new IntWritable(); + private final LongWritable rowId = new LongWritable(); + private long insertedRows = 0; + private final KeyIndexBuilder indexBuilder = new KeyIndexBuilder(); + + static Path getSideFile(Path main) { + return new Path(main + "_flush_length"); + } + + static int getOperation(OrcStruct struct) { + return ((IntWritable) struct.getFieldValue(OPERATION)).get(); + } + + static long getCurrentTransaction(OrcStruct struct) { + return ((LongWritable) struct.getFieldValue(CURRENT_TRANSACTION)).get(); + } + + static long getOriginalTransaction(OrcStruct struct) { + return ((LongWritable) struct.getFieldValue(ORIGINAL_TRANSACTION)).get(); + } + + static int getBucket(OrcStruct struct) { + return ((IntWritable) struct.getFieldValue(BUCKET)).get(); + } + + static long getRowId(OrcStruct struct) { + return ((LongWritable) struct.getFieldValue(ROW_ID)).get(); + } + + static OrcStruct getRow(OrcStruct struct) { + return (OrcStruct) struct.getFieldValue(ROW); + } + + /** + * Create an object inspector for the ACID event based on the object inspector + * for the underlying row. 
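As a concrete picture of the event schema built below: every user row is nested inside a fixed six-field struct. A minimal sketch (plain Java string-building; it sidesteps Hive's ObjectInspector machinery, and the printed type syntax is only illustrative):

    public class EventSchemaDemo {
      // Wrap a row type in the six-field ACID event struct used by this patch.
      static String eventSchema(String rowType) {
        return "struct<operation:int,currentTransaction:bigint," +
            "originalTransaction:bigint,bucket:int,rowId:bigint," +
            "row:" + rowType + ">";
      }

      public static void main(String[] args) {
        // struct<operation:int,...,row:struct<id:int,name:string>>
        System.out.println(eventSchema("struct<id:int,name:string>"));
      }
    }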
+ * @param rowInspector the row's object inspector + * @return an object inspector for the event stream + */ + static ObjectInspector createEventSchema(ObjectInspector rowInspector) { + List fields = new ArrayList(); + fields.add(new OrcStruct.Field("operation", + PrimitiveObjectInspectorFactory.writableIntObjectInspector, OPERATION)); + fields.add(new OrcStruct.Field("currentTransaction", + PrimitiveObjectInspectorFactory.writableLongObjectInspector, + CURRENT_TRANSACTION)); + fields.add(new OrcStruct.Field("originalTransaction", + PrimitiveObjectInspectorFactory.writableLongObjectInspector, + ORIGINAL_TRANSACTION)); + fields.add(new OrcStruct.Field("bucket", + PrimitiveObjectInspectorFactory.writableIntObjectInspector, BUCKET)); + fields.add(new OrcStruct.Field("rowId", + PrimitiveObjectInspectorFactory.writableLongObjectInspector, ROW_ID)); + fields.add(new OrcStruct.Field("row", rowInspector, ROW)); + return new OrcStruct.OrcStructInspector(fields); + } + + OrcRecordUpdater(Path path, + AcidOutputFormat.Options options) throws IOException { + this.options = options; + this.path = AcidUtils.createFilename(path, options); + FileSystem fs = options.getFilesystem(); + if (fs == null) { + fs = path.getFileSystem(options.getConfiguration()); + } + this.fs = fs; + OrcFile.WriterOptions writerOptions = + OrcFile.writerOptions(options.getConfiguration()) + .fileSystem(fs) + .callback(indexBuilder); + if (!options.isWritingBase()) { + writerOptions.blockPadding(false); + writerOptions.bufferSize(DELTA_BUFFER_SIZE); + writerOptions.stripeSize(DELTA_STRIPE_SIZE); + } + writerOptions.inspector(createEventSchema(options.getInspector())); + this.writer = OrcFile.createWriter(this.path, writerOptions); + item = new OrcStruct(FIELDS); + item.setFieldValue(OPERATION, operation); + item.setFieldValue(CURRENT_TRANSACTION, currentTransaction); + item.setFieldValue(ORIGINAL_TRANSACTION, originalTransaction); + item.setFieldValue(BUCKET, bucket); + item.setFieldValue(ROW_ID, rowId); + } + + @Override + public void insert(long currentTransaction, int bucket, + Object row) throws IOException { + operation.set(INSERT_OPERATION); + if (this.currentTransaction.get() != currentTransaction) { + this.currentTransaction.set(currentTransaction); + this.originalTransaction.set(currentTransaction); + insertedRows = 0; + } + indexBuilder.addKey(currentTransaction, bucket, insertedRows); + this.rowId.set(insertedRows++); + this.bucket.set(bucket); + item.setFieldValue(OrcRecordUpdater.ROW, row); + writer.addRow(item); + } + + @Override + public void update(long currentTransaction, long originalTransaction, + int bucket, long rowId, + Object row) throws IOException { + operation.set(UPDATE_OPERATION); + this.currentTransaction.set(currentTransaction); + this.originalTransaction.set(originalTransaction); + this.bucket.set(bucket); + this.rowId.set(rowId); + item.setFieldValue(OrcRecordUpdater.ROW, row); + indexBuilder.addKey(originalTransaction, bucket, rowId); + writer.addRow(item); + } + + @Override + public void delete(long currentTransaction, long originalTransaction, + int bucket, long rowId) throws IOException { + operation.set(DELETE_OPERATION); + this.currentTransaction.set(currentTransaction); + this.originalTransaction.set(originalTransaction); + this.bucket.set(bucket); + this.rowId.set(rowId); + item.setFieldValue(OrcRecordUpdater.ROW, null); + indexBuilder.addKey(originalTransaction, bucket, rowId); + writer.addRow(item); + } + + @Override + public void flush() throws IOException { + long len = 
writer.writeIntermediateFooter(); + if (flushLengths == null) { + flushLengths = fs.create(getSideFile(path), true, 8, + options.getReporter()); + } + flushLengths.writeLong(len); + flushLengths.flush(); + } + + @Override + public void close(boolean abort) throws IOException { + if (abort) { + if (flushLengths == null) { + fs.delete(path, false); + } + } else { + writer.close(); + } + if (flushLengths != null) { + flushLengths.close(); + fs.delete(getSideFile(path), false); + } + writer = null; + } + + @Override + public SerDeStats getStats() { + return null; + } + + private static final Charset utf8 = Charset.forName("UTF-8"); + private static final CharsetDecoder utf8Decoder = utf8.newDecoder(); + + static RecordIdentifier[] parseKeyIndex(Reader reader) { + String[] stripes; + try { + ByteBuffer val = + reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME) + .duplicate(); + stripes = utf8Decoder.decode(val).toString().split(";"); + } catch (CharacterCodingException e) { + throw new IllegalArgumentException("Bad string encoding for " + + OrcRecordUpdater.ACID_KEY_INDEX_NAME, e); + } + RecordIdentifier[] result = new RecordIdentifier[stripes.length]; + for(int i=0; i < stripes.length; ++i) { + if (stripes[i].length() != 0) { + String[] parts = stripes[i].split(","); + result[i] = new RecordIdentifier(); + result[i].setValues(Long.parseLong(parts[0]), + Integer.parseInt(parts[1]), Long.parseLong(parts[2])); + } + } + return result; + } + + static class KeyIndexBuilder implements OrcFile.WriterCallback { + StringBuilder lastKey = new StringBuilder(); + long lastTransaction; + int lastBucket; + long lastRowId; + + @Override + public void preStripeWrite(OrcFile.WriterContext context + ) throws IOException { + lastKey.append(lastTransaction); + lastKey.append(','); + lastKey.append(lastBucket); + lastKey.append(','); + lastKey.append(lastRowId); + lastKey.append(';'); + } + + @Override + public void preFooterWrite(OrcFile.WriterContext context + ) throws IOException { + context.getWriter().addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME, + ByteBuffer.wrap(lastKey.toString().getBytes(utf8))); + } + + void addKey(long transaction, int bucket, long rowId) { + lastTransaction = transaction; + lastBucket = bucket; + lastRowId = rowId; + } + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java index b549b9f..f7a5010 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcSplit.java @@ -22,6 +22,8 @@ import java.io.DataOutput; import java.io.IOException; import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.orc.Reader.FileMetaInfo; @@ -38,19 +40,26 @@ public class OrcSplit extends FileSplit { private Reader.FileMetaInfo fileMetaInfo; private boolean hasFooter; + private boolean isOriginal; + private boolean hasBase; + private final List deltas = new ArrayList(); protected OrcSplit(){ //The FileSplit() constructor in hadoop 0.20 and 1.x is package private so can't use it. //This constructor is used to create the object and then call readFields() // so just pass nulls to this super constructor. 
- super(null, 0, 0, (String[])null); + super(null, 0, 0, (String[]) null); } public OrcSplit(Path path, long offset, long length, String[] hosts, - FileMetaInfo fileMetaInfo) { + FileMetaInfo fileMetaInfo, boolean isOriginal, boolean hasBase, + List deltas) { super(path, offset, length, hosts); this.fileMetaInfo = fileMetaInfo; hasFooter = this.fileMetaInfo != null; + this.isOriginal = isOriginal; + this.hasBase = hasBase; + this.deltas.addAll(deltas); } @Override @@ -58,9 +67,12 @@ public void write(DataOutput out) throws IOException { //serialize path, offset, length using FileSplit super.write(out); - // Whether footer information follows. - out.writeBoolean(hasFooter); - + int flags = (hasBase ? 4 : 0) | (isOriginal ? 2 : 0) | (hasFooter ? 1 : 0); + out.writeByte(flags); + out.writeInt(deltas.size()); + for(Long delta: deltas) { + out.writeLong(delta); + } if (hasFooter) { // serialize FileMetaInfo fields Text.writeString(out, fileMetaInfo.compressionType); @@ -83,8 +95,16 @@ public void readFields(DataInput in) throws IOException { //deserialize path, offset, length using FileSplit super.readFields(in); - hasFooter = in.readBoolean(); + byte flags = in.readByte(); + hasFooter = (1 & flags) != 0; + isOriginal = (2 & flags) != 0; + hasBase = (4 & flags) != 0; + deltas.clear(); + int numDeltas = in.readInt(); + for(int i=0; i < numDeltas; i++) { + deltas.add(in.readLong()); + } if (hasFooter) { // deserialize FileMetaInfo fields String compressionType = Text.readString(in); @@ -107,4 +127,15 @@ public FileMetaInfo getFileMetaInfo(){ public boolean hasFooter() { return hasFooter; } + + public boolean isOriginal() { + return isOriginal; + } + + public boolean hasBase() { + return hasBase; + } + public List getDeltas() { + return deltas; + } } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java index 226a106..9dadc27 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcStruct.java @@ -77,6 +77,16 @@ public void setNumFields(int numFields) { } } + /** + * Destructively swap the values from one struct to the other. + * @param other the value to swap with + */ + void swapFields(OrcStruct other) { + Object[] tmp = other.fields; + other.fields = fields; + fields = tmp; + } + @Override public void write(DataOutput dataOutput) throws IOException { throw new UnsupportedOperationException("write unsupported"); @@ -169,6 +179,11 @@ public String getFieldComment() { protected OrcStructInspector() { super(); } + + OrcStructInspector(List fields) { + this.fields = fields; + } + OrcStructInspector(StructTypeInfo info) { ArrayList fieldNames = info.getAllStructFieldNames(); ArrayList fieldTypes = info.getAllStructFieldTypeInfos(); diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java index 2bab0ce..914bd20 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Reader.java @@ -55,7 +55,7 @@ * Get the user metadata keys. * @return the set of metadata keys */ - Iterable getMetadataKeys(); + List getMetadataKeys(); /** * Get a user metadata value. @@ -87,7 +87,7 @@ * Get the list of stripes. * @return the information about the stripes in order */ - Iterable getStripes(); + List getStripes(); /** * Get the object inspector for looking at the objects. 
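The OrcSplit serialization earlier in this section packs hasBase, isOriginal and hasFooter into a single flag byte and then writes the list of delta ids. A stand-alone round trip of that encoding (plain Java; the FileMetaInfo portion of the split is omitted):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class SplitFlagsDemo {
      public static void main(String[] args) throws IOException {
        boolean hasFooter = false, isOriginal = true, hasBase = true;
        List<Long> deltas = Arrays.asList(101L, 102L);

        // write: one flag byte, then the delta count and ids
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        int flags = (hasBase ? 4 : 0) | (isOriginal ? 2 : 0) | (hasFooter ? 1 : 0);
        out.writeByte(flags);
        out.writeInt(deltas.size());
        for (long delta : deltas) {
          out.writeLong(delta);
        }
        out.flush();

        // read: unpack in the same order
        DataInputStream in = new DataInputStream(
            new ByteArrayInputStream(bytes.toByteArray()));
        byte readFlags = in.readByte();
        boolean rFooter = (readFlags & 1) != 0;
        boolean rOriginal = (readFlags & 2) != 0;
        boolean rBase = (readFlags & 4) != 0;
        List<Long> rDeltas = new ArrayList<Long>();
        int numDeltas = in.readInt();
        for (int i = 0; i < numDeltas; i++) {
          rDeltas.add(in.readLong());
        }
        System.out.println("footer=" + rFooter + " original=" + rOriginal +
            " base=" + rBase + " deltas=" + rDeltas);
      }
    }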
@@ -163,9 +163,7 @@
    * @param include true for each column that should be included
    * @return a new RecordReader that will read the specified rows.
    * @throws IOException
-   * @deprecated
    */
-  @Deprecated
   RecordReader rows(long offset, long length, boolean[] include) throws IOException;
diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
index a34a6ce..3a07e82 100644
--- ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
+++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/ReaderImpl.java
@@ -24,7 +24,6 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
@@ -36,7 +35,6 @@
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type;
 import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
-import org.apache.hadoop.hive.ql.log.PerfLogger;
 import org.apache.hadoop.hive.ql.util.JavaDataModel;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.io.Text;
@@ -68,10 +66,7 @@
   // memory footprint.
   private final ByteBuffer footerByteBuffer;
-  private static final PerfLogger perfLogger = PerfLogger.getPerfLogger();
-  private static final String CLASS_NAME = ReaderImpl.class.getName();
-
-  private static class StripeInformationImpl
+  static class StripeInformationImpl
       implements StripeInformation {
     private final OrcProto.StripeInformation stripe;
@@ -123,7 +118,7 @@ public long getNumberOfRows() {
   }
   @Override
-  public Iterable getMetadataKeys() {
+  public List getMetadataKeys() {
     List result = new ArrayList();
     for(OrcProto.UserMetadataItem item: footer.getMetadataList()) {
       result.add(item.getName());
@@ -152,32 +147,12 @@ public int getCompressionSize() {
   }
   @Override
-  public Iterable getStripes() {
-    return new Iterable(){
-
-      @Override
-      public Iterator iterator() {
-        return new Iterator(){
-          private final Iterator inner =
-              footer.getStripesList().iterator();
-
-          @Override
-          public boolean hasNext() {
-            return inner.hasNext();
-          }
-
-          @Override
-          public org.apache.hadoop.hive.ql.io.orc.StripeInformation next() {
-            return new StripeInformationImpl(inner.next());
-          }
-
-          @Override
-          public void remove() {
-            throw new UnsupportedOperationException("remove unsupported");
-          }
-        };
-      }
-    };
+  public List getStripes() {
+    List result = new ArrayList();
+    for(OrcProto.StripeInformation info: footer.getStripesList()) {
+      result.add(new StripeInformationImpl(info));
+    }
+    return result;
   }
   @Override
@@ -287,22 +262,33 @@ static void checkOrcVersion(Log log, Path path, List version) {
   }
   /**
-   * Constructor that extracts metadata information from file footer
-   * @param fs
-   * @param path
-   * @param conf
+   * Constructor that lets the user specify additional options.
+ * @param path pathname for file + * @param options options for reading * @throws IOException */ - ReaderImpl(FileSystem fs, Path path, Configuration conf) throws IOException { + ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException { + FileSystem fs = options.getFilesystem(); + if (fs == null) { + fs = path.getFileSystem(options.getConfiguration()); + } this.fileSystem = fs; this.path = path; - this.conf = conf; - - FileMetaInfo footerMetaData = extractMetaInfoFromFooter(fs, path); - - MetaInfoObjExtractor rInfo = new MetaInfoObjExtractor(footerMetaData.compressionType, - footerMetaData.bufferSize, footerMetaData.metadataSize, footerMetaData.footerBuffer); + this.conf = options.getConfiguration(); + FileMetaInfo footerMetaData; + if (options.getFileMetaInfo() != null) { + footerMetaData = options.getFileMetaInfo(); + } else { + footerMetaData = extractMetaInfoFromFooter(fs, path, + options.getMaxLength()); + } + MetaInfoObjExtractor rInfo = + new MetaInfoObjExtractor(footerMetaData.compressionType, + footerMetaData.bufferSize, + footerMetaData.metadataSize, + footerMetaData.footerBuffer + ); this.footerByteBuffer = footerMetaData.footerBuffer; this.compressionKind = rInfo.compressionKind; this.codec = rInfo.codec; @@ -314,43 +300,14 @@ static void checkOrcVersion(Log log, Path path, List version) { } - /** - * Constructor that takes already saved footer meta information. Used for creating RecordReader - * from saved information in InputSplit - * @param fs - * @param path - * @param fMetaInfo - * @param conf - * @throws IOException - */ - ReaderImpl(FileSystem fs, Path path, FileMetaInfo fMetaInfo, Configuration conf) - throws IOException { - this.fileSystem = fs; - this.path = path; - this.conf = conf; - - MetaInfoObjExtractor rInfo = new MetaInfoObjExtractor( - fMetaInfo.compressionType, - fMetaInfo.bufferSize, - fMetaInfo.metadataSize, - fMetaInfo.footerBuffer - ); - this.footerByteBuffer = fMetaInfo.footerBuffer; - this.compressionKind = rInfo.compressionKind; - this.codec = rInfo.codec; - this.bufferSize = rInfo.bufferSize; - this.metadataSize = rInfo.metadataSize; - this.metadata = rInfo.metadata; - this.footer = rInfo.footer; - this.inspector = rInfo.inspector; - } - - - private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, Path path) throws IOException { + private static FileMetaInfo extractMetaInfoFromFooter(FileSystem fs, + Path path, + long maxFileLength + ) throws IOException { FSDataInputStream file = fs.open(path); //read last bytes into buffer to get PostScript - long size = fs.getFileStatus(path).getLen(); + long size = Math.min(maxFileLength, fs.getFileStatus(path).getLen()); int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS); file.seek(size - readSize); ByteBuffer buffer = ByteBuffer.allocate(readSize); @@ -467,8 +424,6 @@ public FileMetaInfo getFileMetaInfo(){ return new FileMetaInfo(compressionKind.toString(), bufferSize, metadataSize, footerByteBuffer); } - - @Override public RecordReader rows(boolean[] include) throws IOException { return rows(0, Long.MAX_VALUE, include, null, null); @@ -583,7 +538,7 @@ public long getRawDataSizeOfColumns(List colNames) { // index for the requested field int idxStart = type.getSubtypes(fieldIdx); - int idxEnd = 0; + int idxEnd; // if the specified is the last field and then end index will be last // column index diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java index 27a9338..c54d32c 
100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/VectorizedOrcInputFormat.java @@ -31,7 +31,6 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.io.InputFormatChecker; -import org.apache.hadoop.hive.ql.io.orc.Reader.FileMetaInfo; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.NullWritable; @@ -60,9 +59,10 @@ VectorizedOrcRecordReader(Reader file, Configuration conf, FileSplit fileSplit) throws IOException { List types = file.getTypes(); - boolean[] includedColumns = OrcInputFormat.findIncludedColumns(types, conf); - String[] columnNames = OrcInputFormat.getIncludedColumnNames(types, includedColumns, conf); - SearchArgument sarg = OrcInputFormat.createSarg(types, conf); + // TODO fix to work with ACID + boolean[] includedColumns = OrcInputFormat.findIncludedColumns(types, conf, true, 0); + String[] columnNames = OrcInputFormat.getIncludedColumnNames(types, includedColumns, conf, 0); + SearchArgument sarg = OrcInputFormat.createSarg(conf); this.offset = fileSplit.getStart(); this.length = fileSplit.getLength(); @@ -145,25 +145,15 @@ public VectorizedOrcInputFormat() { reporter.setStatus(fSplit.toString()); Path path = fSplit.getPath(); - FileSystem fs = path.getFileSystem(conf); - Reader reader = null; - - if(!(fSplit instanceof OrcSplit)){ - //If CombineHiveInputFormat is used, it works with FileSplit and not OrcSplit - reader = OrcFile.createReader(fs, path, conf); - } else { - //We have OrcSplit, which may have footer metadata cached, so use the appropriate reader - //constructor + OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf); + if(fSplit instanceof OrcSplit){ OrcSplit orcSplit = (OrcSplit) fSplit; if (orcSplit.hasFooter()) { - FileMetaInfo fMetaInfo = orcSplit.getFileMetaInfo(); - reader = OrcFile.createReader(fs, path, fMetaInfo, conf); - } else { - reader = OrcFile.createReader(fs, path, conf); + opts.fileMetaInfo(orcSplit.getFileMetaInfo()); } } - + Reader reader = OrcFile.createReader(path, opts); return new VectorizedOrcRecordReader(reader, conf, fSplit); } @@ -176,7 +166,8 @@ public boolean validateInput(FileSystem fs, HiveConf conf, } for (FileStatus file : files) { try { - OrcFile.createReader(fs, file.getPath(), conf); + OrcFile.createReader(file.getPath(), + OrcFile.readerOptions(conf).filesystem(fs)); } catch (IOException e) { return false; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java index 591a238..c391b0e 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/Writer.java @@ -65,4 +65,11 @@ * @return row count */ long getNumberOfRows(); + + /** + * Write an intermediate footer on the file such that if the file is + * truncated to the returned offset, it would be a valid ORC file. 
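writeIntermediateFooter() pairs with the *_flush_length side file written by OrcRecordUpdater.flush() earlier in this patch: each flush appends the returned offset as one 8-byte long. Below is a stand-alone sketch of that bookkeeping (plain Java over byte arrays; how a reader of a still-open delta consumes the side file is an assumption here, since it is not shown in this section).

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.DataInputStream;
    import java.io.DataOutputStream;
    import java.io.IOException;

    public class FlushLengthDemo {
      public static void main(String[] args) throws IOException {
        // Stand-in for the "<bucket>_flush_length" side file: one 8-byte offset per flush.
        ByteArrayOutputStream sideFile = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(sideFile);
        long[] footerOffsets = {1234L, 56780L, 901200L};  // offsets returned by each flush
        for (long offset : footerOffsets) {
          out.writeLong(offset);  // the delta is a valid ORC file if truncated here
          out.flush();
        }

        // Assumed reader side: take the last recorded offset as the usable file length.
        DataInputStream in = new DataInputStream(
            new ByteArrayInputStream(sideFile.toByteArray()));
        long usableLength = 0;
        while (in.available() >= 8) {
          usableLength = in.readLong();
        }
        System.out.println("read the ORC file as if it were " + usableLength + " bytes long");
      }
    }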
+ * @return the offset that would be a valid end location for an ORC file + */ + long writeIntermediateFooter() throws IOException; } diff --git ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java index 7e9bed6..df657fc 100644 --- ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java +++ ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java @@ -118,6 +118,7 @@ private long rowsInStripe = 0; private long rawDataSize = 0; private int rowsInIndex = 0; + private int stripesInLastFlush = -1; private final List stripes = new ArrayList(); private final Map userMetadata = @@ -130,6 +131,8 @@ private final MemoryManager memoryManager; private final OrcFile.Version version; private final Configuration conf; + private final OrcFile.WriterCallback callback; + private final OrcFile.WriterContext callbackContext; WriterImpl(FileSystem fs, Path path, @@ -141,10 +144,23 @@ int rowIndexStride, MemoryManager memoryManager, boolean addBlockPadding, - OrcFile.Version version) throws IOException { + OrcFile.Version version, + OrcFile.WriterCallback callback) throws IOException { this.fs = fs; this.path = path; this.conf = conf; + this.callback = callback; + if (callback != null) { + callbackContext = new OrcFile.WriterContext(){ + + @Override + public Writer getWriter() { + return WriterImpl.this; + } + }; + } else { + callbackContext = null; + } this.stripeSize = stripeSize; this.version = version; this.addBlockPadding = addBlockPadding; @@ -1751,7 +1767,9 @@ private void flushStripe() throws IOException { createRowIndexEntry(); } if (rowsInStripe != 0) { - + if (callback != null) { + callback.preStripeWrite(callbackContext); + } // finalize the data for the stripe int requiredIndexEntries = rowIndexStride == 0 ? 0 : (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride); @@ -2010,6 +2028,9 @@ public void addRow(Object row) throws IOException { @Override public void close() throws IOException { + if (callback != null) { + callback.preFooterWrite(callbackContext); + } // remove us from the memory manager so that we don't get any callbacks memoryManager.removeWriter(path); // actually close the file @@ -2039,4 +2060,19 @@ public long getRawDataSize() { public long getNumberOfRows() { return rowCount; } + + @Override + public synchronized long writeIntermediateFooter() throws IOException { + // flush any buffered rows + flushStripe(); + // write a footer + if (stripesInLastFlush != stripes.size()) { + int metaLength = writeMetadata(rawWriter.getPos()); + int footLength = writeFooter(rawWriter.getPos() - metaLength); + rawWriter.writeByte(writePostScript(footLength, metaLength)); + stripesInLastFlush = stripes.size(); + rawWriter.flush(); + } + return rawWriter.getPos(); + } } diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java new file mode 100644 index 0000000..30957f7 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java @@ -0,0 +1,325 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.metastore.api.GetOpenTxnsResponse; +import org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat; +import org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFile; +import org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFileSystem; +import org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockPath; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestAcidUtils { + + @Test + public void testCreateFilename() throws Exception { + Path p = new Path("/tmp"); + Configuration conf = new Configuration(); + AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf) + .setOldStyle(true).bucket(1); + assertEquals("/tmp/00001_0", + AcidUtils.createFilename(p, options).toString()); + options.bucket(123); + assertEquals("/tmp/00123_0", + AcidUtils.createFilename(p, options).toString()); + options.bucket(23) + .minimumTransactionId(100) + .maximumTransactionId(200) + .writingBase(true) + .setOldStyle(false); + assertEquals("/tmp/base_0000200/bucket_00023", + AcidUtils.createFilename(p, options).toString()); + options.writingBase(false); + assertEquals("/tmp/delta_0000100_0000200/bucket_00023", + AcidUtils.createFilename(p, options).toString()); + } + + @Test + public void testParsing() throws Exception { + assertEquals(123, AcidUtils.parseBase(new Path("/tmp/base_000123"))); + Path dir = new Path("/tmp/tbl"); + Configuration conf = new Configuration(); + AcidOutputFormat.Options opts = + AcidUtils.parseBaseBucketFilename(new Path(dir, "base_567/bucket_123"), + conf); + assertEquals(false, opts.getOldStyle()); + assertEquals(true, opts.isWritingBase()); + assertEquals(567, opts.getMaximumTransactionId()); + assertEquals(0, opts.getMinimumTransactionId()); + assertEquals(123, opts.getBucket()); + opts = AcidUtils.parseBaseBucketFilename(new Path(dir, "000123_0"), conf); + assertEquals(true, opts.getOldStyle()); + assertEquals(true, opts.isWritingBase()); + assertEquals(123, opts.getBucket()); + assertEquals(0, opts.getMinimumTransactionId()); + assertEquals(0, opts.getMaximumTransactionId()); + } + + public static class MockTransactionList + implements IMetaStoreClient.ValidTxnList { + private final long[] openTxns; + private final long maxTxn; + + public MockTransactionList(long maxTxn, long... 
openTxns) { + this.maxTxn = maxTxn; + this.openTxns = openTxns; + } + + @Override + public boolean isTxnCommitted(long txnid) { + if (txnid > maxTxn) { + return false; + } + for(long txn: openTxns) { + if (txn == txnid) { + return false; + } + } + return true; + } + + @Override + public RangeResponse isTxnRangeCommitted(long minTxnId, long maxTxnId) { + if (minTxnId > maxTxn) { + return RangeResponse.NONE; + } + long count = 0; + for(long txn: openTxns) { + if (minTxnId <= txn && txn <= maxTxnId) { + count += 1; + } + } + if (count == 0) { + if (maxTxn >= maxTxnId) { + return RangeResponse.ALL; + } else { + return RangeResponse.SOME; + } + } else if (count == (maxTxnId - minTxnId + 1)) { + return RangeResponse.NONE; + } else { + return RangeResponse.SOME; + } + } + + @Override + public GetOpenTxnsResponse getOpenTxns() { + return null; + } + + @Override + public void fromString(String src) { + // TODO don't think this is needed + } + } + + @Test + public void testOriginal() throws Exception { + Configuration conf = new Configuration(); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("/tbl/part1/000000_0", 500, new byte[0]), + new MockFile("/tbl/part1/000001_1", 500, new byte[0]), + new MockFile("/tbl/part1/000002_0", 500, new byte[0]), + new MockFile("/tbl/part1/random", 500, new byte[0]), + new MockFile("/tbl/part1/_done", 0, new byte[0]), + new MockFile("/tbl/part1/subdir/000000_0", 0, new byte[0])); + AcidUtils.Directory dir = + AcidUtils.getAcidState(new MockPath(fs, "/tbl/part1"), conf, + new MockTransactionList(100)); + assertEquals(null, dir.getBaseDirectory()); + assertEquals(0, dir.getCurrentDirectories().size()); + assertEquals(0, dir.getObsolete().size()); + List result = dir.getOriginalFiles(); + assertEquals(5, result.size()); + assertEquals("/tbl/part1/000000_0", result.get(0).getPath().toString()); + assertEquals("/tbl/part1/000001_1", result.get(1).getPath().toString()); + assertEquals("/tbl/part1/000002_0", result.get(2).getPath().toString()); + assertEquals("/tbl/part1/random", result.get(3).getPath().toString()); + assertEquals("/tbl/part1/subdir/000000_0", result.get(4).getPath().toString()); + } + + @Test + public void testOriginalDeltas() throws Exception { + Configuration conf = new Configuration(); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("/tbl/part1/000000_0", 500, new byte[0]), + new MockFile("/tbl/part1/000001_1", 500, new byte[0]), + new MockFile("/tbl/part1/000002_0", 500, new byte[0]), + new MockFile("/tbl/part1/random", 500, new byte[0]), + new MockFile("/tbl/part1/_done", 0, new byte[0]), + new MockFile("/tbl/part1/subdir/000000_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_025_025/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_029_029/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_025_030/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_050_100/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_101_101/bucket_0", 0, new byte[0])); + AcidUtils.Directory dir = + AcidUtils.getAcidState(new TestInputOutputFormat.MockPath(fs, + "/tbl/part1"), conf, new MockTransactionList(100)); + assertEquals(null, dir.getBaseDirectory()); + List obsolete = dir.getObsolete(); + assertEquals(2, obsolete.size()); + assertEquals("/tbl/part1/delta_025_025", + obsolete.get(0).getPath().toString()); + assertEquals("/tbl/part1/delta_029_029", + obsolete.get(1).getPath().toString()); + List result = dir.getOriginalFiles(); + assertEquals(5, result.size()); + assertEquals("/tbl/part1/000000_0", 
result.get(0).getPath().toString()); + assertEquals("/tbl/part1/000001_1", result.get(1).getPath().toString()); + assertEquals("/tbl/part1/000002_0", result.get(2).getPath().toString()); + assertEquals("/tbl/part1/random", result.get(3).getPath().toString()); + assertEquals("/tbl/part1/subdir/000000_0", result.get(4).getPath().toString()); + List deltas = dir.getCurrentDirectories(); + assertEquals(2, deltas.size()); + AcidUtils.ParsedDelta delt = deltas.get(0); + assertEquals("/tbl/part1/delta_025_030", delt.getPath().toString()); + assertEquals(25, delt.getMinTransaction()); + assertEquals(30, delt.getMaxTransaction()); + delt = deltas.get(1); + assertEquals("/tbl/part1/delta_050_100", delt.getPath().toString()); + assertEquals(50, delt.getMinTransaction()); + assertEquals(100, delt.getMaxTransaction()); + } + + @Test + public void testBaseDeltas() throws Exception { + Configuration conf = new Configuration(); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("/tbl/part1/base_5/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_10/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_49/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/delta_025_025/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_029_029/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_025_030/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_050_105/bucket_0", 0, new byte[0]), + new MockFile("/tbl/part1/delta_90_120/bucket_0", 0, new byte[0])); + AcidUtils.Directory dir = + AcidUtils.getAcidState(new TestInputOutputFormat.MockPath(fs, + "/tbl/part1"), conf, new MockTransactionList(100)); + assertEquals("/tbl/part1/base_49", dir.getBaseDirectory().toString()); + List obsolete = dir.getObsolete(); + assertEquals(5, obsolete.size()); + assertEquals("/tbl/part1/base_10", obsolete.get(0).getPath().toString()); + assertEquals("/tbl/part1/base_5", obsolete.get(1).getPath().toString()); + assertEquals("/tbl/part1/delta_025_030", obsolete.get(2).getPath().toString()); + assertEquals("/tbl/part1/delta_025_025", obsolete.get(3).getPath().toString()); + assertEquals("/tbl/part1/delta_029_029", obsolete.get(4).getPath().toString()); + assertEquals(0, dir.getOriginalFiles().size()); + List deltas = dir.getCurrentDirectories(); + assertEquals(1, deltas.size()); + AcidUtils.ParsedDelta delt = deltas.get(0); + assertEquals("/tbl/part1/delta_050_105", delt.getPath().toString()); + assertEquals(50, delt.getMinTransaction()); + assertEquals(105, delt.getMaxTransaction()); + } + + @Test + public void testBestBase() throws Exception { + Configuration conf = new Configuration(); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("/tbl/part1/base_5/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_10/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_25/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_100/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_200/bucket_0", 500, new byte[0])); + Path part = new MockPath(fs, "/tbl/part1"); + AcidUtils.Directory dir = + AcidUtils.getAcidState(part, conf, new MockTransactionList(150)); + assertEquals("/tbl/part1/base_100", dir.getBaseDirectory().toString()); + List obsoletes = dir.getObsolete(); + assertEquals(3, obsoletes.size()); + assertEquals("/tbl/part1/base_10", obsoletes.get(0).getPath().toString()); + assertEquals("/tbl/part1/base_25", obsoletes.get(1).getPath().toString()); + assertEquals("/tbl/part1/base_5", obsoletes.get(2).getPath().toString()); + 
assertEquals(0, dir.getOriginalFiles().size()); + assertEquals(0, dir.getCurrentDirectories().size()); + dir = AcidUtils.getAcidState(part, conf, new MockTransactionList(150, 99)); + assertEquals("/tbl/part1/base_25", dir.getBaseDirectory().toString()); + obsoletes = dir.getObsolete(); + assertEquals(2, obsoletes.size()); + assertEquals("/tbl/part1/base_10", obsoletes.get(0).getPath().toString()); + assertEquals("/tbl/part1/base_5", obsoletes.get(1).getPath().toString()); + dir = AcidUtils.getAcidState(part, conf, new MockTransactionList(150, 25)); + assertEquals("/tbl/part1/base_10", dir.getBaseDirectory().toString()); + obsoletes = dir.getObsolete(); + assertEquals(1, obsoletes.size()); + assertEquals("/tbl/part1/base_5", obsoletes.get(0).getPath().toString()); + try { + dir = AcidUtils.getAcidState(part, conf, new MockTransactionList(150, 2)); + assertTrue(false); + } catch (IllegalArgumentException iae) { + //expected + } + } + + @Test + public void testObsoleteOriginals() throws Exception { + Configuration conf = new Configuration(); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("/tbl/part1/base_10/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_5/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/000000_0", 500, new byte[0]), + new MockFile("/tbl/part1/000001_1", 500, new byte[0])); + Path part = new MockPath(fs, "/tbl/part1"); + AcidUtils.Directory dir = + AcidUtils.getAcidState(part, conf, new MockTransactionList(150)); + List obsolete = dir.getObsolete(); + assertEquals(3, obsolete.size()); + assertEquals("/tbl/part1/base_5", obsolete.get(0).getPath().toString()); + assertEquals("/tbl/part1/000000_0", obsolete.get(1).getPath().toString()); + assertEquals("/tbl/part1/000001_1", obsolete.get(2).getPath().toString()); + assertEquals("/tbl/part1/base_10", dir.getBaseDirectory().toString()); + } + + @Test + public void testOverlapingDelta() throws Exception { + Configuration conf = new Configuration(); + MockFileSystem fs = new MockFileSystem(conf, + new MockFile("/tbl/part1/delta_0000063_63/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/delta_000062_62/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/delta_00061_61/bucket_0", 500, new byte[0]), new MockFile("/tbl/part1/delta_40_60/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/delta_0060_60/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/delta_052_55/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/delta_40_60/bucket_0", 500, new byte[0]), + new MockFile("/tbl/part1/base_50/bucket_0", 500, new byte[0])); + Path part = new MockPath(fs, "/tbl/part1"); + AcidUtils.Directory dir = + AcidUtils.getAcidState(part, conf, new MockTransactionList(100)); + assertEquals("/tbl/part1/base_50", dir.getBaseDirectory().toString()); + List obsolete = dir.getObsolete(); + assertEquals(2, obsolete.size()); + assertEquals("/tbl/part1/delta_052_55", obsolete.get(0).getPath().toString()); + assertEquals("/tbl/part1/delta_0060_60", obsolete.get(1).getPath().toString()); + List delts = dir.getCurrentDirectories(); + assertEquals(4, delts.size()); + assertEquals("/tbl/part1/delta_40_60", delts.get(0).getPath().toString()); + assertEquals("/tbl/part1/delta_00061_61", delts.get(1).getPath().toString()); + assertEquals("/tbl/part1/delta_000062_62", delts.get(2).getPath().toString()); + assertEquals("/tbl/part1/delta_0000063_63", delts.get(3).getPath().toString()); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/TestRecordIdentifier.java 
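The TestAcidUtils cases above pin down the behavior of the new AcidUtils.getAcidState() call; a condensed caller-side sketch is shown here. The class name and printouts are illustrative, the method names mirror the assertions above, and the generic result types are inferred from how the tests consume them:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.ql.io.AcidUtils;

public class AcidStateSketch {
  public static void describe(Path partition, Configuration conf,
                              IMetaStoreClient.ValidTxnList txns) throws IOException {
    AcidUtils.Directory dir = AcidUtils.getAcidState(partition, conf, txns);
    // newest base_N that is usable under txns, or null if there is none
    Path base = dir.getBaseDirectory();
    // deltas that still have to be merged on top of the base, in order
    for (AcidUtils.ParsedDelta delta : dir.getCurrentDirectories()) {
      System.out.println(delta.getPath() + " covers transactions "
          + delta.getMinTransaction() + ".." + delta.getMaxTransaction());
    }
    // pre-ACID bucket files found under the partition, if any
    List<FileStatus> originals = dir.getOriginalFiles();
    // files and directories superseded by newer bases or deltas
    List<FileStatus> obsolete = dir.getObsolete();
    System.out.println("base=" + base + ", originals=" + originals.size()
        + ", obsolete=" + obsolete.size());
  }
}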
ql/src/test/org/apache/hadoop/hive/ql/io/TestRecordIdentifier.java new file mode 100644 index 0000000..6d83f70 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/TestRecordIdentifier.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io; + +import org.junit.Test; + +import static org.junit.Assert.assertTrue; + +public class TestRecordIdentifier { + @Test + public void TestOrdering() throws Exception { + RecordIdentifier left = new RecordIdentifier(100, 200, 1200); + RecordIdentifier right = new RecordIdentifier(); + right.setValues(100L, 200, 1000L); + assertTrue(right.compareTo(left) < 0); + assertTrue(left.compareTo(right) > 0); + left.set(right); + assertTrue(right.compareTo(left) == 0); + right.setRowId(2000); + assertTrue(right.compareTo(left) > 0); + left.setValues(1, 2, 3); + right.setValues(100, 2, 3); + assertTrue(left.compareTo(right) < 0); + assertTrue(right.compareTo(left) > 0); + left.setValues(1, 2, 3); + right.setValues(1, 100, 3); + assertTrue(left.compareTo(right) < 0); + assertTrue(right.compareTo(left) > 0); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java index 7552b2c..d6b0573 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestInputOutputFormat.java @@ -26,11 +26,12 @@ import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Properties; +import java.util.Set; +import java.util.TreeSet; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.BlockLocation; @@ -56,7 +57,6 @@ import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.InputFormat; import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; @@ -160,7 +160,7 @@ public void testFileGenerator() throws Exception { new MockFile("/a/b/.part-03", 1000, new byte[0]), new MockFile("/a/b/part-04", 1000, new byte[0])); OrcInputFormat.FileGenerator gen = - new OrcInputFormat.FileGenerator(context, fs, new Path("/a/b")); + new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "/a/b")); gen.run(); if (context.getErrors().size() > 0) { for(Throwable th: context.getErrors()) { @@ -178,26 +178,25 @@ public void testFileGenerator() throws Exception { ((OrcInputFormat.SplitGenerator) context.queue.get(2)).getPath()); } - static final Charset UTF8 = 
Charset.forName("UTF-8"); - - static class MockBlock { + public static class MockBlock { int offset; int length; final String[] hosts; - MockBlock(String... hosts) { + public MockBlock(String... hosts) { this.hosts = hosts; } } - static class MockFile { + public static class MockFile { final Path path; final int blockSize; final int length; final MockBlock[] blocks; final byte[] content; - MockFile(String path, int blockSize, byte[] content, MockBlock... blocks) { + public MockFile(String path, int blockSize, byte[] content, + MockBlock... blocks) { this.path = new Path(path); this.blockSize = blockSize; this.blocks = blocks; @@ -244,11 +243,23 @@ public int read() throws IOException { } } - static class MockFileSystem extends FileSystem { + public static class MockPath extends Path { + private final FileSystem fs; + public MockPath(FileSystem fs, String path) { + super(path); + this.fs = fs; + } + @Override + public FileSystem getFileSystem(Configuration conf) { + return fs; + } + } + + public static class MockFileSystem extends FileSystem { final MockFile[] files; Path workingDir = new Path("/"); - MockFileSystem(Configuration conf, MockFile... files) { + public MockFileSystem(Configuration conf, MockFile... files) { setConf(conf); this.files = files; } @@ -305,11 +316,29 @@ public boolean delete(Path path, boolean b) throws IOException { @Override public FileStatus[] listStatus(Path path) throws IOException { List result = new ArrayList(); + String pathname = path.toString(); + String pathnameAsDir = pathname + "/"; + Set dirs = new TreeSet(); for(MockFile file: files) { - if (file.path.getParent().equals(path)) { - result.add(getFileStatus(file.path)); + String filename = file.path.toString(); + if (pathname.equals(filename)) { + return new FileStatus[]{createStatus(file)}; + } else if (filename.startsWith(pathnameAsDir)) { + String tail = filename.substring(pathnameAsDir.length()); + int nextSlash = tail.indexOf('/'); + if (nextSlash > 0) { + dirs.add(tail.substring(0, nextSlash)); + } else { + result.add(createStatus(file)); + } } } + // for each directory add it once + for(String dir: dirs) { + result.add(new FileStatus(0, true, 0, 0, 0, 0, + FsPermission.createImmutable((short) 755), "owen", "group", + new MockPath(this, pathnameAsDir + dir))); + } return result.toArray(new FileStatus[result.size()]); } @@ -328,13 +357,17 @@ public boolean mkdirs(Path path, FsPermission fsPermission) { return false; } + private FileStatus createStatus(MockFile file) { + return new FileStatus(file.length, false, 1, file.blockSize, 0, 0, + FsPermission.createImmutable((short) 644), "owen", "group", + file.path); + } + @Override public FileStatus getFileStatus(Path path) throws IOException { for(MockFile file: files) { if (file.path.equals(path)) { - return new FileStatus(file.length, false, 1, file.blockSize, 0, 0, - FsPermission.createImmutable((short) 644), "owen", "group", - file.path); + return createStatus(file); } } return null; @@ -426,9 +459,10 @@ public void testAddSplit() throws Exception { OrcInputFormat.Context context = new OrcInputFormat.Context(conf); OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(context, fs, - fs.getFileStatus(new Path("/a/file")), null); + fs.getFileStatus(new Path("/a/file")), null, true, + new ArrayList(), true); splitter.createSplit(0, 200, null); - OrcInputFormat.Context.FileSplitInfo result = context.getResult(-1); + OrcSplit result = context.getResult(-1); assertEquals(0, result.getStart()); assertEquals(200, result.getLength()); 
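The test updates that follow all migrate from the removed OrcFile.createReader(fs, path, conf) form to the options-based form; the before/after pattern, with illustrative variable names, is:

// old form, removed by this patch
//   Reader reader = OrcFile.createReader(fs, path, conf);
// new form, carrying the FileSystem (and, on the OrcSplit path shown earlier,
// the cached footer metadata) in OrcFile.ReaderOptions
Reader reader = OrcFile.createReader(path,
    OrcFile.readerOptions(conf).filesystem(fs));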
assertEquals("/a/file", result.getPath().toString()); @@ -469,7 +503,8 @@ public void testSplitGenerator() throws Exception { OrcInputFormat.Context context = new OrcInputFormat.Context(conf); OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(context, fs, - fs.getFileStatus(new Path("/a/file")), null); + fs.getFileStatus(new Path("/a/file")), null, true, + new ArrayList(), true); splitter.run(); if (context.getErrors().size() > 0) { for(Throwable th: context.getErrors()) { @@ -477,7 +512,7 @@ public void testSplitGenerator() throws Exception { } throw new IOException("Errors during splitting"); } - OrcInputFormat.Context.FileSplitInfo result = context.getResult(0); + OrcSplit result = context.getResult(0); assertEquals(3, result.getStart()); assertEquals(497, result.getLength()); result = context.getResult(1); @@ -497,7 +532,8 @@ public void testSplitGenerator() throws Exception { conf.setInt(OrcInputFormat.MAX_SPLIT_SIZE, 0); context = new OrcInputFormat.Context(conf); splitter = new OrcInputFormat.SplitGenerator(context, fs, - fs.getFileStatus(new Path("/a/file")), null); + fs.getFileStatus(new Path("/a/file")), null, true, new ArrayList(), + true); splitter.run(); if (context.getErrors().size() > 0) { for(Throwable th: context.getErrors()) { @@ -512,6 +548,7 @@ public void testSplitGenerator() throws Exception { } @Test + @SuppressWarnings("unchecked,deprecation") public void testInOutFormat() throws Exception { Properties properties = new Properties(); StructObjectInspector inspector; @@ -625,8 +662,8 @@ public void readFields(DataInput dataInput) throws IOException { } @Test + @SuppressWarnings("unchecked,deprecation") public void testMROutput() throws Exception { - JobConf job = new JobConf(conf); Properties properties = new Properties(); StructObjectInspector inspector; synchronized (TestOrcFile.class) { @@ -682,8 +719,8 @@ public void testMROutput() throws Exception { } @Test + @SuppressWarnings("deprecation") public void testEmptyFile() throws Exception { - JobConf job = new JobConf(conf); Properties properties = new Properties(); HiveOutputFormat outFormat = new OrcOutputFormat(); FSRecordWriter writer = @@ -720,8 +757,8 @@ public void readFields(DataInput dataInput) throws IOException { } @Test + @SuppressWarnings("unchecked,deprecation") public void testDefaultTypes() throws Exception { - JobConf job = new JobConf(conf); Properties properties = new Properties(); StructObjectInspector inspector; synchronized (TestOrcFile.class) { diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewInputOutputFormat.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewInputOutputFormat.java index b9899f7..d95b3c7 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewInputOutputFormat.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewInputOutputFormat.java @@ -191,7 +191,8 @@ public void testNewOutputFormat() throws Exception { Path outputFilePath = new Path(outputPath, "part-m-00000"); assertTrue(localFs.exists(outputFilePath)); - Reader reader = OrcFile.createReader(localFs, outputFilePath, conf); + Reader reader = OrcFile.createReader(outputFilePath, + OrcFile.readerOptions(conf).filesystem(localFs)); assertTrue(reader.getNumberOfRows() == rownum); assertEquals(reader.getCompression(), CompressionKind.ZLIB); StructObjectInspector soi = @@ -248,7 +249,8 @@ public void testNewOutputFormatWithCompression() throws Exception { assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); - Reader reader = 
OrcFile.createReader(localFs, outputFilePath, conf); + Reader reader = OrcFile.createReader(outputFilePath, + OrcFile.readerOptions(conf).filesystem(localFs)); assertEquals(reader.getCompression(), CompressionKind.SNAPPY); localFs.delete(outputPath, true); @@ -351,7 +353,8 @@ public void testNewOutputFormatComplex() throws Exception { assertTrue(result); Path outputFilePath = new Path(outputPath, "part-r-00000"); - Reader reader = OrcFile.createReader(localFs, outputFilePath, conf); + Reader reader = OrcFile.createReader(outputFilePath, + OrcFile.readerOptions(conf).filesystem(localFs)); RecordReader rows = reader.rows(null); ObjectInspector orcOi = reader.getObjectInspector(); @@ -362,36 +365,36 @@ public void testNewOutputFormatComplex() throws Exception { Object row = rows.next(null); List converted = (List)converter.convert(row); - assertEquals(converted.get(0), 1); - assertEquals(converted.get(1), 1); + assertEquals(1, converted.get(0)); + assertEquals(1, converted.get(1)); List list = (List)converted.get(2); assertEquals(list.size(), 1); - assertEquals(((List)list.get(0)).get(0), "saving"); - assertEquals(((List)list.get(0)).get(1), 6); + assertEquals("saving", ((List)list.get(0)).get(0)); + assertEquals(6, ((List)list.get(0)).get(1)); Map map = (Map)converted.get(3); assertEquals(map.size(), 1); assertEquals(map.get("saving"), new Integer(1)); row = rows.next(null); converted = (List)converter.convert(row); - assertEquals(converted.get(0), 2); - assertEquals(converted.get(1), 6); + assertEquals(2, converted.get(0)); + assertEquals(6, converted.get(1)); list = (List)converted.get(2); assertEquals(list.size(), 6); - assertEquals(((List)list.get(0)).get(0), "plums"); - assertEquals(((List)list.get(0)).get(1), 5); + assertEquals("plums", ((List)list.get(0)).get(0)); + assertEquals(5, ((List)list.get(0)).get(1)); map = (Map)converted.get(3); assertEquals(map.size(), 11); assertEquals(map.get("the"), new Integer(2)); row = rows.next(null); converted = (List)converter.convert(row); - assertEquals(converted.get(0), 3); - assertEquals(converted.get(1), 5); + assertEquals(3, converted.get(0)); + assertEquals(5, converted.get(1)); list = (List)converted.get(2); assertEquals(list.size(), 5); - assertEquals(((List)list.get(0)).get(0), "eaten"); - assertEquals(((List)list.get(0)).get(1), 5); + assertEquals("eaten", ((List)list.get(0)).get(0)); + assertEquals(5, ((List)list.get(0)).get(1)); map = (Map)converted.get(3); assertEquals(map.size(), 13); assertEquals(map.get("were"), new Integer(3)); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java index 8f68acc..d231822 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestNewIntegerEncoding.java @@ -20,13 +20,7 @@ import static junit.framework.Assert.assertEquals; import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.charset.Charset; import java.sql.Timestamp; -import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -65,37 +59,12 @@ public Row(int val, long l) { } } - public List fetchData(String path) throws IOException { - List input = new ArrayList(); - FileInputStream stream = new FileInputStream(new File(path)); - try { - FileChannel fc = stream.getChannel(); - MappedByteBuffer bb = 
fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size()); - /* Instead of using default, pass in a decoder. */ - String[] lines = Charset.defaultCharset().decode(bb).toString() - .split("\n"); - for(String line : lines) { - long val = 0; - try { - val = Long.parseLong(line); - } catch (NumberFormatException e) { - // for now lets ignore (assign 0) - } - input.add(val); - } - } finally { - stream.close(); - } - return input; - } - Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); Configuration conf; FileSystem fs; Path testFilePath; - String resDir = "ql/src/test/resources"; @Rule public TestName testCaseName = new TestName(); @@ -128,7 +97,8 @@ public void testBasicRow() throws Exception { writer.addRow(new Row(111, 1111L)); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); while (rows.hasNext()) { Object row = rows.next(null); @@ -162,7 +132,8 @@ public void testBasicOld() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -197,7 +168,8 @@ public void testBasicNew() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -228,7 +200,8 @@ public void testBasicDelta1() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -259,7 +232,8 @@ public void testBasicDelta2() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -290,7 +264,8 @@ public void testBasicDelta3() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -321,7 +296,8 @@ public void testBasicDelta4() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -351,7 +327,8 @@ public void testIntegerMin() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -382,7 +359,8 @@ public void testIntegerMax() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = 
OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -413,7 +391,8 @@ public void testLongMin() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -444,7 +423,8 @@ public void testLongMax() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -478,7 +458,8 @@ public void testRandomInt() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -512,7 +493,8 @@ public void testRandomLong() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -554,7 +536,8 @@ public void testPatchedBaseNegativeMin() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -596,7 +579,8 @@ public void testPatchedBaseNegativeMin2() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -638,7 +622,8 @@ public void testPatchedBaseNegativeMin3() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -671,7 +656,8 @@ public void testPatchedBaseNegativeMin4() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -706,7 +692,8 @@ public void testPatchedBaseAt0() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -741,7 +728,8 @@ public void testPatchedBaseAt1() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -775,7 +763,8 @@ public void testPatchedBaseAt255() throws Exception { 
} writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -809,7 +798,8 @@ public void testPatchedBaseAt256() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -843,7 +833,8 @@ public void testPatchedBase510() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -877,7 +868,8 @@ public void testPatchedBase511() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -908,7 +900,8 @@ public void testPatchedBaseMax1() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -941,7 +934,8 @@ public void testPatchedBaseMax2() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -987,7 +981,8 @@ public void testPatchedBaseMax3() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -1036,7 +1031,8 @@ public void testPatchedBaseMax4() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -1097,7 +1093,8 @@ public void testPatchedBaseTimestamp() throws Exception { writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 0; while (rows.hasNext()) { @@ -1125,7 +1122,8 @@ public void testDirectLargeNegatives() throws Exception { writer.addRow(-5535739865598783616L); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); Object row = rows.next(null); assertEquals(-7486502418706614742L, ((LongWritable) row).get()); @@ -1164,7 +1162,8 @@ public void testSeek() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + 
OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); int idx = 55555; rows.seekToRow(idx); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java index 4d3013d..eb7a59c 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcFile.java @@ -29,6 +29,7 @@ import java.nio.ByteBuffer; import java.sql.Timestamp; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -105,9 +106,7 @@ MiddleStruct(InnerStruct... items) { list.clear(); - for(InnerStruct item: items) { - list.add(item); - } + list.addAll(Arrays.asList(items)); } } @@ -162,9 +161,7 @@ private static InnerStruct inner(int i, String s) { private static List list(InnerStruct... items) { List result = new ArrayList(); - for(InnerStruct s: items) { - result.add(s); - } + result.addAll(Arrays.asList(items)); return result; } @@ -206,8 +203,10 @@ public void openFileSystem () throws Exception { @Test public void testReadFormat_0_11() throws Exception { - Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc")); - Reader reader = OrcFile.createReader(fs, oldFilePath, conf); + Path oldFilePath = + new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc")); + Reader reader = OrcFile.createReader(oldFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); int stripeCount = 0; int rowCount = 0; @@ -470,7 +469,8 @@ public void testStringAndBinaryStatistics() throws Exception { writer.addRow(new SimpleStruct(bytes(0,1,2,3,4,5), null)); writer.addRow(new SimpleStruct(null, "hi")); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // check the stats ColumnStatistics[] stats = reader.getStatistics(); @@ -565,42 +565,43 @@ public void testStripeLevelStats() throws Exception { } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); Metadata metadata = reader.getMetadata(); int numStripes = metadata.getStripeStatistics().size(); assertEquals(3, numStripes); StripeStatistics ss1 = metadata.getStripeStatistics().get(0); StripeStatistics ss2 = metadata.getStripeStatistics().get(1); StripeStatistics ss3 = metadata.getStripeStatistics().get(2); - assertEquals(4996, ss1.getColumnStatistics()[0].getNumberOfValues()); + assertEquals(5000, ss1.getColumnStatistics()[0].getNumberOfValues()); assertEquals(5000, ss2.getColumnStatistics()[0].getNumberOfValues()); - assertEquals(1004, ss3.getColumnStatistics()[0].getNumberOfValues()); + assertEquals(1000, ss3.getColumnStatistics()[0].getNumberOfValues()); - assertEquals(4996, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getNumberOfValues()); - assertEquals(5000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getNumberOfValues()); - assertEquals(1004, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getNumberOfValues()); + assertEquals(5000, (ss1.getColumnStatistics()[1]).getNumberOfValues()); + assertEquals(5000, (ss2.getColumnStatistics()[1]).getNumberOfValues()); + assertEquals(1000, (ss3.getColumnStatistics()[1]).getNumberOfValues()); assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMinimum()); - 
assertEquals(1, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum()); - assertEquals(2, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum()); + assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMinimum()); + assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMinimum()); assertEquals(1, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getMaximum()); assertEquals(2, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getMaximum()); assertEquals(3, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getMaximum()); - assertEquals(4996, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum()); - assertEquals(9996, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum()); - assertEquals(3008, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum()); + assertEquals(5000, ((IntegerColumnStatistics)ss1.getColumnStatistics()[1]).getSum()); + assertEquals(10000, ((IntegerColumnStatistics)ss2.getColumnStatistics()[1]).getSum()); + assertEquals(3000, ((IntegerColumnStatistics)ss3.getColumnStatistics()[1]).getSum()); - assertEquals(4996, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getNumberOfValues()); - assertEquals(5000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getNumberOfValues()); - assertEquals(1004, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getNumberOfValues()); + assertEquals(5000, (ss1.getColumnStatistics()[2]).getNumberOfValues()); + assertEquals(5000, (ss2.getColumnStatistics()[2]).getNumberOfValues()); + assertEquals(1000, (ss3.getColumnStatistics()[2]).getNumberOfValues()); assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMinimum()); - assertEquals("one", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum()); + assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMinimum()); assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMinimum()); assertEquals("one", ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getMaximum()); assertEquals("two", ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getMaximum()); - assertEquals("two", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum()); - assertEquals(14988, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum()); + assertEquals("three", ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getMaximum()); + assertEquals(15000, ((StringColumnStatistics)ss1.getColumnStatistics()[2]).getSum()); assertEquals(15000, ((StringColumnStatistics)ss2.getColumnStatistics()[2]).getSum()); - assertEquals(5012, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum()); + assertEquals(5000, ((StringColumnStatistics)ss3.getColumnStatistics()[2]).getSum()); RecordReaderImpl recordReader = (RecordReaderImpl) reader.rows(null); OrcProto.RowIndex[] index = recordReader.readRowIndex(0); @@ -643,7 +644,8 @@ public void test1() throws Exception { list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), map(inner(5, "chani"), inner(1, "mauddib")))); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); Metadata metadata = reader.getMetadata(); @@ -719,12 +721,8 @@ public void test1() throws Exception { getStructFieldRef("list").getFieldObjectInspector(); MapObjectInspector ma = (MapObjectInspector) readerInspector. 
getStructFieldRef("map").getFieldObjectInspector(); - StructObjectInspector lc = (StructObjectInspector) - li.getListElementObjectInspector(); StringObjectInspector mk = (StringObjectInspector) ma.getMapKeyObjectInspector(); - StructObjectInspector mv = (StructObjectInspector) - ma.getMapValueObjectInspector(); RecordReader rows = reader.rows(null); Object row = rows.next(null); assertNotNull(row); @@ -893,7 +891,8 @@ public void columnProjection() throws Exception { writer.addRow(inner(x, y)); } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // check out the statistics ColumnStatistics[] stats = reader.getStatistics(); @@ -957,7 +956,8 @@ public void emptyFile() throws Exception { .compress(CompressionKind.NONE) .bufferSize(100)); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(false, reader.rows(null).hasNext()); assertEquals(CompressionKind.NONE, reader.getCompression()); assertEquals(0, reader.getNumberOfRows()); @@ -994,7 +994,8 @@ public void metaData() throws Exception { null, null, null, null)); writer.addUserMetadata("clobber", byteBuf(5,7,11,13,17,19)); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(byteBuf(5,7,11,13,17,19), reader.getMetadataValue("clobber")); assertEquals(byteBuf(1,2,3,4,5,6,7,-1,-2,127,-128), reader.getMetadataValue("my.meta")); @@ -1088,7 +1089,7 @@ public void testUnionAndTimestamp() throws Exception { if ((i & 1) == 0) { union.set((byte) 0, new IntWritable(i*i)); } else { - union.set((byte) 1, new Text(new Integer(i*i).toString())); + union.set((byte) 1, new Text(Integer.toString(i * i))); } value = HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18)); @@ -1112,7 +1113,8 @@ public void testUnionAndTimestamp() throws Exception { union.set((byte) 0, new IntWritable(138)); writer.addRow(row); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); assertEquals(5309, reader.getNumberOfRows()); DecimalColumnStatistics stats = @@ -1191,7 +1193,7 @@ public void testUnionAndTimestamp() throws Exception { assertEquals(new IntWritable(i*i), union.getObject()); } else { assertEquals(1, union.getTag()); - assertEquals(new Text(new Integer(i*i).toString()), union.getObject()); + assertEquals(new Text(Integer.toString(i * i)), union.getObject()); } assertEquals(HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18)), row.getFieldValue(2)); @@ -1243,7 +1245,8 @@ public void testSnappy() throws Exception { Integer.toHexString(rand.nextInt()))); } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); RecordReader rows = reader.rows(null); rand = new Random(12); OrcStruct row = null; @@ -1286,7 +1289,8 @@ public void testWithoutIndex() throws Exception { } } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + 
OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(50000, reader.getNumberOfRows()); assertEquals(0, reader.getRowIndexStride()); StripeInformation stripe = reader.getStripes().iterator().next(); @@ -1349,8 +1353,8 @@ public void testSeek() throws Exception { byteValues, words, i)); } writer.close(); - writer = null; - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(COUNT, reader.getNumberOfRows()); RecordReader rows = reader.rows(null); OrcStruct row = null; @@ -1369,17 +1373,17 @@ public void testSeek() throws Exception { ((IntWritable) row.getFieldValue(3)).get()); assertEquals(expected.long1.longValue(), ((LongWritable) row.getFieldValue(4)).get()); - assertEquals(expected.float1.floatValue(), + assertEquals(expected.float1, ((FloatWritable) row.getFieldValue(5)).get(), 0.0001); - assertEquals(expected.double1.doubleValue(), + assertEquals(expected.double1, ((DoubleWritable) row.getFieldValue(6)).get(), 0.0001); assertEquals(expected.bytes1, row.getFieldValue(7)); assertEquals(expected.string1, row.getFieldValue(8)); List expectedList = expected.middle.list; List actualList = - (List) ((OrcStruct) row.getFieldValue(9)).getFieldValue(0); + (List) ((OrcStruct) row.getFieldValue(9)).getFieldValue(0); compareList(expectedList, actualList); - compareList(expected.list, (List) row.getFieldValue(10)); + compareList(expected.list, (List) row.getFieldValue(10)); } rows.close(); Iterator stripeIterator = @@ -1420,7 +1424,8 @@ public void testSeek() throws Exception { private void compareInner(InnerStruct expect, OrcStruct actual) throws Exception { if (expect == null || actual == null) { - assertEquals(expect, actual); + assertEquals(null, expect); + assertEquals(null, actual); } else { assertEquals(expect.int1, ((IntWritable) actual.getFieldValue(0)).get()); assertEquals(expect.string1, actual.getFieldValue(1)); @@ -1517,7 +1522,8 @@ public void testMemoryManagement() throws Exception { } writer.close(); assertEquals(null, memory.path); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); int i = 0; for(StripeInformation stripe: reader.getStripes()) { i += 1; @@ -1542,7 +1548,8 @@ public void testPredicatePushdown() throws Exception { writer.addRow(new InnerStruct(i*300, Integer.toHexString(10*i))); } writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(3500, reader.getNumberOfRows()); SearchArgument sarg = SearchArgument.FACTORY.newBuilder() diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java index 9952bff..2eec73e 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcNullOptimization.java @@ -108,7 +108,8 @@ public void testMultiStripeWithNull() throws Exception { Lists.newArrayList(new InnerStruct(100)))); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // check the stats ColumnStatistics[] stats = reader.getStatistics(); assertEquals(20000, reader.getNumberOfRows()); @@ -212,7 +213,8 @@ public 
void testMultiStripeWithoutNull() throws Exception { Lists.newArrayList(new InnerStruct(100)))); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // check the stats ColumnStatistics[] stats = reader.getStatistics(); assertEquals(20000, reader.getNumberOfRows()); @@ -313,7 +315,8 @@ public void testColumnsWithNullAndCompression() throws Exception { Lists.newArrayList(new InnerStruct(100)))); writer.close(); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // check the stats ColumnStatistics[] stats = reader.getStatistics(); assertEquals(8, reader.getNumberOfRows()); diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRawRecordMerger.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRawRecordMerger.java new file mode 100644 index 0000000..9bd47d6 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRawRecordMerger.java @@ -0,0 +1,350 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.io.orc; + +import org.apache.derby.impl.store.raw.data.RecordId; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.io.RecordIdentifier; +import org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.OriginalReaderPair; +import org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey; +import org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderPair; +import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.junit.Test; +import org.mockito.AdditionalMatchers; +import org.mockito.MockSettings; +import org.mockito.Mockito; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestOrcRawRecordMerger { + + @Test + public void testOrdering() throws Exception { + ReaderKey left = new ReaderKey(100, 200, 1200, 300); + ReaderKey right = new ReaderKey(); + right.setValues(100, 200, 1000, 200); + assertTrue(right.compareTo(left) < 0); + assertTrue(left.compareTo(right) > 0); + assertEquals(false, left.equals(right)); + left.set(right); + assertTrue(right.compareTo(left) == 0); + assertEquals(true, right.equals(left)); + right.setRowId(2000); + assertTrue(right.compareTo(left) > 0); + left.setValues(1, 2, 3, 4); + right.setValues(100, 2, 3, 4); + assertTrue(left.compareTo(right) < 0); + assertTrue(right.compareTo(left) > 0); + left.setValues(1, 2, 3, 4); + right.setValues(1, 100, 3, 4); + assertTrue(left.compareTo(right) < 0); + assertTrue(right.compareTo(left) > 0); + left.setValues(1, 2, 3, 100); + right.setValues(1, 2, 3, 4); + assertTrue(left.compareTo(right) < 0); + assertTrue(right.compareTo(left) > 0); + + // ensure that we are consistent when comparing to the base class + RecordIdentifier ri = new RecordIdentifier(1, 2, 3); + assertEquals(1, ri.compareTo(left)); + assertEquals(-1, left.compareTo(ri)); + assertEquals(false, ri.equals(left)); + assertEquals(false, left.equals(ri)); + } + + private static void setRow(OrcStruct event, + int operation, + long originalTransaction, + int bucket, + long rowId, + long currentTransaction, + String value) { + event.setFieldValue(OrcRecordUpdater.OPERATION, new IntWritable(operation)); + event.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION, + new LongWritable(originalTransaction)); + event.setFieldValue(OrcRecordUpdater.BUCKET, new IntWritable(bucket)); + event.setFieldValue(OrcRecordUpdater.ROW_ID, new LongWritable(rowId)); + event.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION, + new LongWritable(currentTransaction)); + OrcStruct row = new OrcStruct(1); + row.setFieldValue(0, new Text(value)); + event.setFieldValue(OrcRecordUpdater.ROW, row); + } + + private static String value(OrcStruct event) { + return OrcRecordUpdater.getRow(event).getFieldValue(0).toString(); + } + + private List createStripes(long... 
rowCounts) { + long offset = 0; + List result = + new ArrayList(rowCounts.length); + for(long count: rowCounts) { + OrcProto.StripeInformation.Builder stripe = + OrcProto.StripeInformation.newBuilder(); + stripe.setDataLength(800).setIndexLength(100).setFooterLength(100) + .setNumberOfRows(count).setOffset(offset); + offset += 1000; + result.add(new ReaderImpl.StripeInformationImpl(stripe.build())); + } + return result; + } + + private Reader createMockReader(boolean[] includes, long offset, + boolean verbose) throws IOException { + MockSettings settings = Mockito.withSettings(); + if (verbose) { + settings.verboseLogging(); + } + Reader reader = Mockito.mock(Reader.class, settings); + RecordReader recordReader = Mockito.mock(RecordReader.class, settings); + OrcStruct row1 = new OrcStruct(OrcRecordUpdater.FIELDS); + setRow(row1, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 20, 100, "first"); + OrcStruct row2 = new OrcStruct(OrcRecordUpdater.FIELDS); + setRow(row2, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 30, 110, "second"); + OrcStruct row3 = new OrcStruct(OrcRecordUpdater.FIELDS); + setRow(row3, OrcRecordUpdater.INSERT_OPERATION, 10, 20, 40, 120, "third"); + OrcStruct row4 = new OrcStruct(OrcRecordUpdater.FIELDS); + setRow(row4, OrcRecordUpdater.INSERT_OPERATION, 40, 50, 60, 130, "fourth"); + OrcStruct row5 = new OrcStruct(OrcRecordUpdater.FIELDS); + setRow(row5, OrcRecordUpdater.INSERT_OPERATION, 40, 50, 61, 140, "fifth"); + Mockito.when(reader.rows(offset, Long.MAX_VALUE, includes, null, null)) + .thenReturn(recordReader); + + Mockito.when(recordReader.hasNext()). + thenReturn(true, true, true, true, true, false); + + Mockito.when(recordReader.next(null)).thenReturn(row1); + Mockito.when(recordReader.next(row1)).thenReturn(row2); + Mockito.when(recordReader.next(row2)).thenReturn(row3); + Mockito.when(recordReader.next(row3)).thenReturn(row4); + Mockito.when(recordReader.next(row4)).thenReturn(row5); + + Mockito.when(reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)) + .thenReturn(ByteBuffer.wrap("10,20,30;40,50,60;40,50,61" + .getBytes("UTF-8"))); + Mockito.when(reader.getStripes()) + .thenReturn(createStripes(2, 2, 1)); + return reader; + } + + @Test + public void testReaderPair() throws Exception { + ReaderKey key = new ReaderKey(); + boolean[] includes = new boolean[7]; + Reader reader = createMockReader(includes, 0, false); + RecordIdentifier minKey = new RecordIdentifier(10, 20, 30); + RecordIdentifier maxKey = new RecordIdentifier(40, 50, 60); + ReaderPair pair = new ReaderPair(key, reader, 20, minKey, maxKey, includes); + RecordReader recordReader = pair.recordReader; + assertEquals(10, key.getTransactionId()); + assertEquals(20, key.getBucketId()); + assertEquals(40, key.getRowId()); + assertEquals(120, key.getCurrentTransactionId()); + assertEquals("third", value(pair.nextRecord)); + + pair.next(); + assertEquals(40, key.getTransactionId()); + assertEquals(50, key.getBucketId()); + assertEquals(60, key.getRowId()); + assertEquals(130, key.getCurrentTransactionId()); + assertEquals("fourth", value(pair.nextRecord)); + + pair.next(); + assertEquals(null, pair.nextRecord); + Mockito.verify(recordReader).close(); + } + + @Test + public void testReaderPairNoMin() throws Exception { + ReaderKey key = new ReaderKey(); + boolean[] includes = new boolean[7]; + Reader reader = createMockReader(includes, 0, false); + + ReaderPair pair = new ReaderPair(key, reader, 20, null, null, includes); + RecordReader recordReader = pair.recordReader; + assertEquals(10, 
key.getTransactionId()); + assertEquals(20, key.getBucketId()); + assertEquals(20, key.getRowId()); + assertEquals(100, key.getCurrentTransactionId()); + assertEquals("first", value(pair.nextRecord)); + + pair.next(); + assertEquals(10, key.getTransactionId()); + assertEquals(20, key.getBucketId()); + assertEquals(30, key.getRowId()); + assertEquals(110, key.getCurrentTransactionId()); + assertEquals("second", value(pair.nextRecord)); + + pair.next(); + assertEquals(10, key.getTransactionId()); + assertEquals(20, key.getBucketId()); + assertEquals(40, key.getRowId()); + assertEquals(120, key.getCurrentTransactionId()); + assertEquals("third", value(pair.nextRecord)); + + pair.next(); + assertEquals(40, key.getTransactionId()); + assertEquals(50, key.getBucketId()); + assertEquals(60, key.getRowId()); + assertEquals(130, key.getCurrentTransactionId()); + assertEquals("fourth", value(pair.nextRecord)); + + pair.next(); + assertEquals(40, key.getTransactionId()); + assertEquals(50, key.getBucketId()); + assertEquals(61, key.getRowId()); + assertEquals(140, key.getCurrentTransactionId()); + assertEquals("fifth", value(pair.nextRecord)); + + pair.next(); + assertEquals(null, pair.nextRecord); + Mockito.verify(recordReader).close(); + } + + private static OrcStruct createOriginalRow(String value) { + OrcStruct result = new OrcStruct(1); + result.setFieldValue(0, new Text(value)); + return result; + } + + private Reader createMockOriginalReader() throws IOException { + MockSettings settings = Mockito.withSettings(); + Reader reader = Mockito.mock(Reader.class, settings); + RecordReader recordReader = Mockito.mock(RecordReader.class, settings); + OrcStruct row1 = createOriginalRow("first"); + OrcStruct row2 = createOriginalRow("second"); + OrcStruct row3 = createOriginalRow("third"); + OrcStruct row4 = createOriginalRow("fourth"); + OrcStruct row5 = createOriginalRow("fifth"); + + Mockito.when(reader.rows(Mockito.eq(0L), + Mockito.eq(Long.MAX_VALUE), + AdditionalMatchers.aryEq(new boolean[]{true, true}), + Mockito.isNull(SearchArgument.class), + Mockito.isNull(String[].class))) + .thenReturn(recordReader); + Mockito.when(recordReader.hasNext()). 
+ thenReturn(true, true, true, true, true, false); + Mockito.when(recordReader.getRowNumber()).thenReturn(0L, 1L, 2L, 3L, 4L); + Mockito.when(recordReader.next(null)).thenReturn(row1); + Mockito.when(recordReader.next(row1)).thenReturn(row2); + Mockito.when(recordReader.next(row2)).thenReturn(row3); + Mockito.when(recordReader.next(row3)).thenReturn(row4); + Mockito.when(recordReader.next(row4)).thenReturn(row5); + return reader; + } + + @Test + public void testOriginalReaderPair() throws Exception { + ReaderKey key = new ReaderKey(); + Reader reader = createMockOriginalReader(); + RecordIdentifier minKey = new RecordIdentifier(0, 10, 1); + RecordIdentifier maxKey = new RecordIdentifier(0, 10, 3); + boolean[] includes = new boolean[]{false, false, false, false, false, false, + true, true}; + ReaderPair pair = new OriginalReaderPair(key, reader, 10, minKey, maxKey, 0, + includes, null, null); + RecordReader recordReader = pair.recordReader; + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(2, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("third", value(pair.nextRecord)); + + pair.next(); + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(3, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("fourth", value(pair.nextRecord)); + + pair.next(); + assertEquals(null, pair.nextRecord); + Mockito.verify(recordReader).close(); + } + + @Test + public void testOriginalReaderPairNoMin() throws Exception { + ReaderKey key = new ReaderKey(); + Reader reader = createMockOriginalReader(); + boolean[] includes = new boolean[]{false, false, false, false, false, false, + true, true}; + ReaderPair pair = new OriginalReaderPair(key, reader, 10, null, null, 0, + includes, null, null); + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(0, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("first", value(pair.nextRecord)); + + pair.next(); + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(1, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("second", value(pair.nextRecord)); + + pair.next(); + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(2, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("third", value(pair.nextRecord)); + + pair.next(); + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(3, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("fourth", value(pair.nextRecord)); + + pair.next(); + assertEquals(0, key.getTransactionId()); + assertEquals(10, key.getBucketId()); + assertEquals(4, key.getRowId()); + assertEquals(0, key.getCurrentTransactionId()); + assertEquals("fifth", value(pair.nextRecord)); + + pair.next(); + assertEquals(null, pair.nextRecord); + Mockito.verify(pair.recordReader).close(); + } + + @Test + public void testInitialization() throws Exception { + Configuration conf = new Configuration(); + boolean[] includes = new boolean[8]; + Reader reader = createMockReader(includes, 1000, false); + OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, false, reader, + false, 10, 1000, 1000, includes, null, null); + + assertEquals(new RecordIdentifier(10, 20, 30), merger.getMinKey()); + assertEquals(new RecordIdentifier(40, 50, 
60), merger.getMaxKey()); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java new file mode 100644 index 0000000..77c3987 --- /dev/null +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcRecordUpdater.java @@ -0,0 +1,213 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.AcidOutputFormat; +import org.apache.hadoop.hive.ql.io.AcidUtils; +import org.apache.hadoop.hive.ql.io.RecordUpdater; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.Reporter; +import org.junit.Test; + +import java.io.DataInputStream; +import java.io.File; + +import static org.junit.Assert.assertEquals; + +public class TestOrcRecordUpdater { + + @Test + public void testAccessors() throws Exception { + OrcStruct event = new OrcStruct(OrcRecordUpdater.FIELDS); + event.setFieldValue(OrcRecordUpdater.OPERATION, + new IntWritable(OrcRecordUpdater.INSERT_OPERATION)); + event.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION, + new LongWritable(100)); + event.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION, + new LongWritable(50)); + event.setFieldValue(OrcRecordUpdater.BUCKET, new IntWritable(200)); + event.setFieldValue(OrcRecordUpdater.ROW_ID, new LongWritable(300)); + assertEquals(OrcRecordUpdater.INSERT_OPERATION, + OrcRecordUpdater.getOperation(event)); + assertEquals(50, OrcRecordUpdater.getOriginalTransaction(event)); + assertEquals(100, OrcRecordUpdater.getCurrentTransaction(event)); + assertEquals(200, OrcRecordUpdater.getBucket(event)); + assertEquals(300, OrcRecordUpdater.getRowId(event)); + } + + Path workDir = new Path(System.getProperty("test.tmp.dir", + "target" + File.separator + "test" + File.separator + "tmp")); + + static class MyRow { + Text field; + MyRow(String val) { + field = new Text(val); + } + } + + @Test + public void testWriter() throws Exception { + Path root = new Path(workDir, "testWriter"); + Configuration conf = new Configuration(); + // Must use raw local because the checksummer doesn't honor flushes. 
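+ // (Presumably because FileSystem.getLocal(conf) returns a checksummed LocalFileSystem
+ // whose output stream buffers data for CRC generation, so flush() does not reliably make
+ // the bytes visible to the readers opened below; getRaw() exposes the underlying
+ // RawLocalFileSystem, where flushed data can be read back immediately.)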
+ FileSystem fs = FileSystem.getLocal(conf).getRaw(); + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf) + .filesystem(fs) + .bucket(10) + .writingBase(true) + .maximumTransactionId(100) + .inspector(inspector) + .reporter(Reporter.NULL); + RecordUpdater updater = new OrcRecordUpdater(root, options); + updater.insert(0, 1, new MyRow("first")); + updater.insert(0, 2, new MyRow("second")); + updater.insert(0, 3, new MyRow("third")); + updater.flush(); + updater.insert(10, 1, new MyRow("fourth")); + updater.insert(10, 1, new MyRow("fifth")); + updater.flush(); + Path bucketPath = AcidUtils.createFilename(root, options); + Path sidePath = OrcRecordUpdater.getSideFile(bucketPath); + DataInputStream side = fs.open(sidePath); + + // read the stopping point for the first flush and make sure we only see + // 3 rows + long len = side.readLong(); + Reader reader = OrcFile.createReader(bucketPath, + new OrcFile.ReaderOptions(conf).filesystem(fs).maxLength(len)); + assertEquals(3, reader.getNumberOfRows()); + + // read the second flush and make sure we see all 5 rows + len = side.readLong(); + side.close(); + reader = OrcFile.createReader(bucketPath, + new OrcFile.ReaderOptions(conf).filesystem(fs).maxLength(len)); + assertEquals(5, reader.getNumberOfRows()); + RecordReader rows = reader.rows(null); + + // check the contents of the file + assertEquals(true, rows.hasNext()); + OrcStruct row = (OrcStruct) rows.next(null); + assertEquals(OrcRecordUpdater.INSERT_OPERATION, + OrcRecordUpdater.getOperation(row)); + assertEquals(0, OrcRecordUpdater.getCurrentTransaction(row)); + assertEquals(0, OrcRecordUpdater.getOriginalTransaction(row)); + assertEquals(1, OrcRecordUpdater.getBucket(row)); + assertEquals(0, OrcRecordUpdater.getRowId(row)); + assertEquals("first", + OrcRecordUpdater.getRow(row).getFieldValue(0).toString()); + assertEquals(true, rows.hasNext()); + row = (OrcStruct) rows.next(null); + assertEquals(1, OrcRecordUpdater.getRowId(row)); + assertEquals(2, OrcRecordUpdater.getBucket(row)); + assertEquals("second", + OrcRecordUpdater.getRow(row).getFieldValue(0).toString()); + assertEquals(true, rows.hasNext()); + row = (OrcStruct) rows.next(null); + assertEquals(2, OrcRecordUpdater.getRowId(row)); + assertEquals(3, OrcRecordUpdater.getBucket(row)); + assertEquals("third", + OrcRecordUpdater.getRow(row).getFieldValue(0).toString()); + assertEquals(true, rows.hasNext()); + row = (OrcStruct) rows.next(null); + assertEquals(10, OrcRecordUpdater.getCurrentTransaction(row)); + assertEquals(10, OrcRecordUpdater.getOriginalTransaction(row)); + assertEquals(1, OrcRecordUpdater.getBucket(row)); + assertEquals(0, OrcRecordUpdater.getRowId(row)); + assertEquals("fourth", + OrcRecordUpdater.getRow(row).getFieldValue(0).toString()); + assertEquals(true, rows.hasNext()); + row = (OrcStruct) rows.next(null); + assertEquals(1, OrcRecordUpdater.getRowId(row)); + assertEquals("fifth", + OrcRecordUpdater.getRow(row).getFieldValue(0).toString()); + assertEquals(false, rows.hasNext()); + + // add one more record and close + updater.insert(20, 0, new MyRow("sixth")); + updater.close(false); + reader = OrcFile.createReader(bucketPath, + new OrcFile.ReaderOptions(conf).filesystem(fs)); + assertEquals(6, reader.getNumberOfRows()); + assertEquals(false, fs.exists(sidePath)); + } + + @Test + public void 
testUpdates() throws Exception { + Path root = new Path(workDir, "testUpdates"); + Configuration conf = new Configuration(); + FileSystem fs = root.getFileSystem(conf); + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf) + .filesystem(fs) + .bucket(10) + .writingBase(false) + .minimumTransactionId(100) + .maximumTransactionId(100) + .inspector(inspector) + .reporter(Reporter.NULL); + RecordUpdater updater = new OrcRecordUpdater(root, options); + updater.update(100, 10, 20, 30, new MyRow("update")); + updater.delete(100, 40, 50, 60); + updater.close(false); + Path bucketPath = AcidUtils.createFilename(root, options); + + Reader reader = OrcFile.createReader(bucketPath, + new OrcFile.ReaderOptions(conf).filesystem(fs)); + assertEquals(2, reader.getNumberOfRows()); + + RecordReader rows = reader.rows(null); + + // check the contents of the file + assertEquals(true, rows.hasNext()); + OrcStruct row = (OrcStruct) rows.next(null); + assertEquals(OrcRecordUpdater.UPDATE_OPERATION, + OrcRecordUpdater.getOperation(row)); + assertEquals(100, OrcRecordUpdater.getCurrentTransaction(row)); + assertEquals(10, OrcRecordUpdater.getOriginalTransaction(row)); + assertEquals(20, OrcRecordUpdater.getBucket(row)); + assertEquals(30, OrcRecordUpdater.getRowId(row)); + assertEquals("update", + OrcRecordUpdater.getRow(row).getFieldValue(0).toString()); + assertEquals(true, rows.hasNext()); + row = (OrcStruct) rows.next(null); + assertEquals(100, OrcRecordUpdater.getCurrentTransaction(row)); + assertEquals(40, OrcRecordUpdater.getOriginalTransaction(row)); + assertEquals(50, OrcRecordUpdater.getBucket(row)); + assertEquals(60, OrcRecordUpdater.getRowId(row)); + assertEquals(null, OrcRecordUpdater.getRow(row)); + assertEquals(false, rows.hasNext()); + } +} diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java index 0732534..16e6eb9 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestOrcSerDeStats.java @@ -25,6 +25,7 @@ import java.io.File; import java.sql.Timestamp; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -96,9 +97,7 @@ public MapStruct(Map m1) { MiddleStruct(InnerStruct... items) { list.clear(); - for (InnerStruct item : items) { - list.add(item); - } + list.addAll(Arrays.asList(items)); } } @@ -158,9 +157,7 @@ private static InnerStruct inner(int i, String s) { private static List list(InnerStruct... 
items) { List result = new ArrayList(); - for (InnerStruct s : items) { - result.add(s); - } + result.addAll(Arrays.asList(items)); return result; } @@ -212,7 +209,8 @@ public void testStringAndBinaryStatistics() throws Exception { writer.close(); assertEquals(4, writer.getNumberOfRows()); assertEquals(273, writer.getRawDataSize()); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(4, reader.getNumberOfRows()); assertEquals(273, reader.getRawDataSize()); assertEquals(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1"))); @@ -310,7 +308,8 @@ public void testOrcSerDeStatsList() throws Exception { assertEquals(5000, writer.getNumberOfRows()); assertEquals(430000000, writer.getRawDataSize()); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // stats from reader assertEquals(5000, reader.getNumberOfRows()); assertEquals(430000000, reader.getRawDataSize()); @@ -341,7 +340,8 @@ public void testOrcSerDeStatsMap() throws Exception { assertEquals(1000, writer.getNumberOfRows()); assertEquals(950000, writer.getRawDataSize()); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // stats from reader assertEquals(1000, reader.getNumberOfRows()); assertEquals(950000, reader.getRawDataSize()); @@ -372,7 +372,8 @@ public void testOrcSerDeStatsSimpleWithNulls() throws Exception { assertEquals(1000, writer.getNumberOfRows()); assertEquals(44500, writer.getRawDataSize()); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); // stats from reader assertEquals(1000, reader.getNumberOfRows()); assertEquals(44500, reader.getRawDataSize()); @@ -413,7 +414,8 @@ public void testOrcSerDeStatsComplex() throws Exception { long rawDataSize = writer.getRawDataSize(); assertEquals(2, rowCount); assertEquals(1740, rawDataSize); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(2, reader.getNumberOfRows()); assertEquals(1740, reader.getRawDataSize()); @@ -506,7 +508,8 @@ public void testOrcSerDeStatsComplexOldFormat() throws Exception { long rawDataSize = writer.getRawDataSize(); assertEquals(2, rowCount); assertEquals(1740, rawDataSize); - Reader reader = OrcFile.createReader(fs, testFilePath, conf); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(2, reader.getNumberOfRows()); assertEquals(1740, reader.getRawDataSize()); @@ -573,7 +576,8 @@ public void testOrcSerDeStatsComplexOldFormat() throws Exception { @Test(expected = ClassCastException.class) public void testSerdeStatsOldFormat() throws Exception { Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc")); - Reader reader = OrcFile.createReader(fs, oldFilePath, conf); + Reader reader = OrcFile.createReader(oldFilePath, + OrcFile.readerOptions(conf).filesystem(fs)); int stripeCount = 0; int rowCount = 0; diff --git ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java index 
10534c0..4970b6b 100644 --- ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java +++ ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestVectorizedORCReader.java @@ -22,7 +22,6 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.type.Decimal128; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.DateWritable; @@ -64,6 +63,7 @@ public void openFileSystem() throws Exception { fs.delete(testFilePath, false); } + @SuppressWarnings("unused") static class MyRecord { private final Boolean bo; private final Byte by; @@ -131,8 +131,10 @@ public void createFile() throws Exception { private void checkVectorizedReader() throws Exception { - Reader vreader = OrcFile.createReader(testFilePath.getFileSystem(conf), testFilePath, conf); - Reader reader = OrcFile.createReader(testFilePath.getFileSystem(conf), testFilePath, conf); + Reader vreader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf)); + Reader reader = OrcFile.createReader(testFilePath, + OrcFile.readerOptions(conf)); RecordReaderImpl vrr = (RecordReaderImpl) vreader.rows(null); RecordReaderImpl rr = (RecordReaderImpl) reader.rows(null); VectorizedRowBatch batch = null; @@ -142,7 +144,7 @@ private void checkVectorizedReader() throws Exception { while (vrr.hasNext()) { batch = vrr.nextBatch(batch); for (int i = 0; i < batch.size; i++) { - row = (OrcStruct) rr.next((Object) row); + row = (OrcStruct) rr.next(row); for (int j = 0; j < batch.cols.length; j++) { Object a = (row.getFieldValue(j)); Object b = batch.cols[j].getWritableObject(i);
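The hunks in these test files repeatedly migrate from the OrcFile.createReader(fs, path, conf) call to the ReaderOptions builder. Below is a minimal, illustrative sketch of that builder usage, not part of the patch itself: the file path and flushed-length value are hypothetical, and it only uses the calls that appear in the diffs above (readerOptions, filesystem, maxLength, getNumberOfRows).

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.io.orc.OrcFile;
    import org.apache.hadoop.hive.ql.io.orc.Reader;

    public class ReaderOptionsSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Raw local file system, as in TestOrcRecordUpdater, so flushed bytes are readable.
        FileSystem fs = FileSystem.getLocal(conf).getRaw();
        Path file = new Path(args.length > 0 ? args[0] : "example.orc"); // hypothetical path

        // Old style (replaced throughout these tests):
        //   Reader reader = OrcFile.createReader(fs, file, conf);

        // New style: optional settings hang off a single ReaderOptions builder.
        Reader reader = OrcFile.createReader(file,
            OrcFile.readerOptions(conf).filesystem(fs));
        System.out.println("rows in file: " + reader.getNumberOfRows());

        // maxLength bounds how much of the file is considered, which is how
        // TestOrcRecordUpdater re-reads a bucket file only up to the offset
        // recorded in the side file after each flush.
        long flushedLength = 1024; // hypothetical; normally read from the side file
        Reader partial = OrcFile.createReader(file,
            OrcFile.readerOptions(conf).filesystem(fs).maxLength(flushedLength));
        System.out.println("rows up to flushed length: " + partial.getNumberOfRows());
      }
    }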