diff --git data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz new file mode 100644 index 0000000000000000000000000000000000000000..7319e3c5fb3b6e0408d87bed575a2224dca3507f GIT binary patch literal 616 zcmV-u0+;IOCjM@J_oXJ-Zm zE>EC(AqCb3o{u*W2IaL$fB31$>A>)7sy{na%*epd5W9g23JS~&j!q!cvcXCl8EU@+ zoh91%vi6IBk|0oIiP;VzCo4u|=NTE{be-X7UnfL5%AY0zZ?Gkd@UInpF-tpc) z1)RW9L4lK@BqK2|TcI+)loMt&&@+Bu!|VUogG`LDL%;3JS|=7L?^>)J18&1T z6%;`6#RPM&7f=VQ0uy5cCy-IG<%ehdtu4F^3<7uOr_?$jM;~q@fkDHLo$$hT_@JW{r>JdAal_aDWGu+N-s^31$toHat2_y8tNKi3vPQ*Ed#T+OxQlpsj z1*&&*0G4GrV671M{7B|H#630C4%tv6154ZnGB6-K3^K970ccV#!axQFFFl`Xjy293 zSdg4(2{aKHZ^niMOoT_nY literal 0 HcmV?d00001 diff --git data/files/teradata_binary_file/teradata_binary_table.deflate data/files/teradata_binary_file/teradata_binary_table.deflate new file mode 100644 index 0000000000000000000000000000000000000000..fd53ddeb8c14f73604b7f45aa42d2436ccd8c7e5 GIT binary patch literal 1329 zcmV-11JhdDkvjG)ip{ zXzeSlAvO@((vnE42e)SVM{4dwIm@USnHjs9)A<$@;nq$pdc**FqJkxkQrRTwvDl|K=Bv@i1;-RupPhI!1(h=sehV! zBJo-L`M0kA{FvlUWaa>%MqU|mNFbpw#tyt$xtZS0*}k02JWr7~d)dKUgA{nKNM>J( zj0#kWZ$v%P&ItuXy^^8Z;YlEI((1oh-`MGleXM?iGL#TQ3k?#005E#9obCC{Lkv(Q z6mbccc4ipOuM6@iXadz5tuJ|zdiv0DGQXii{k<(YPZ(NIgc1h@ms+FVVLLtL-@V?9 zMjW=F0svKz56a0^pHMD_=zI~$qus2#*X!?iy8Yb5esR#yci zGIAmBG%xZhaHVvX9_+bc*MryACt!a`ss?E=%hXeYQ#C+u2(=M`C>E9)oToUUB7|-l zO$#Z z3Mg=E^EA`$VXS&2o?LcQkJ@{R`vH~LxoM?)WQPnbX-+xS}-Z~d% zLPmkZ+fFT`4Zwq2X*#aZ)F`7E<(6)11i%80ynNeQ@34!+?fMFcR)Q= z`&tcgW31!;tHrZv?hE<=XrCb-Mb(|z0{Xi`=&FX*q4=!)LEp$RG|EEg96(m$vW2J} zWf~k=lVqJ~mc#LxcAqUDWpuHWHGxv%IZ8eG&9y)iEudQu$^*`+26#P4WCn^nt4vgY n5{XmXOiA1$6FSGOkN)en=nazg(!!ztd(a%L5cmHBknDH^kvMdF literal 0 HcmV?d00001 diff --git ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java new file mode 100644 index 0000000000..bed87c582e --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileInputFormat.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import java.io.IOException; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; + +/** + * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde. + * FileInputFormat for Teradata binary files. 
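+ *
+ * For illustration only, a Hive table would typically wire this input format in together with the
+ * TeradataBinarySerde roughly as below (a sketch; the accompanying qtest shows the full DDL):
+ * <pre>
+ * ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+ * STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+ * OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+ * TBLPROPERTIES ('teradata.row.length'='64KB')
+ * </pre>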
+ * + * In the Teradata Binary File, each record constructs as below: + * The first 2 bytes represents the length of the bytes next for this record. + * Then the null bitmap whose length is depended on the number of fields is followed. + * Then each field of the record is serialized into bytes - the serialization strategy is decided by the type of field. + * At last, there is one byte (0x0a) in the end of the record. + * + * This InputFormat currently doesn't support the split of the file. + * Teradata binary files are using little endian. + */ +public class TeradataBinaryFileInputFormat extends FileInputFormat { + + @Override public RecordReader getRecordReader(InputSplit split, JobConf job, + Reporter reporter) throws IOException { + reporter.setStatus(split.toString()); + return new TeradataBinaryRecordReader(job, (FileSplit) split); + } + + /** + * the TeradataBinaryFileInputFormat is not splittable right now. + * Override the isSplitable function. + * + * @param fs the file system that the file is on + * @param filename the file name to check + * @return is this file splitable? + */ + @Override protected boolean isSplitable(FileSystem fs, Path filename) { + return false; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java new file mode 100644 index 0000000000..0469825091 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryFileOutputFormat.java @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Properties; + +import org.apache.commons.io.EndianUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.util.Progressable; + +import static java.lang.String.format; + +/** + * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde. + * FileOutputFormat for Teradata binary files. + * + * In the Teradata Binary File, each record constructs as below: + * The first 2 bytes represents the length of the bytes next for this record (null bitmap and fields). + * Then the null bitmap whose length is depended on the number of fields is followe. 
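+ * (For example, a record with 11 fields would presumably carry a ceil(11 / 8) = 2 byte null bitmap,
+ * assuming one indicator bit per field rounded up to whole bytes.)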
+ * Then each field of the record is serialized into bytes - the serialization strategy is decided by the type of field. + * At last, there is one byte (0x0a) in the end of the record. + * + * Teradata binary files are using little endian. + */ +public class TeradataBinaryFileOutputFormat + extends HiveIgnoreKeyTextOutputFormat { + private static final Log LOG = LogFactory.getLog(TeradataBinaryFileOutputFormat.class); + + static final byte RECORD_END_BYTE = (byte) 0x0a; + + /** + * create the final out file, and output row by row. After one row is + * appended, a configured row separator is appended + * + * @param jc + * the job configuration file + * @param outPath + * the final output file to be created + * @param valueClass + * the value class used for create + * @param isCompressed + * whether the content is compressed or not + * @param tableProperties + * the tableProperties of this file's corresponding table + * @param progress + * progress used for status report + * @return the RecordWriter + */ + @Override public RecordWriter getHiveRecordWriter(JobConf jc, Path outPath, Class valueClass, + boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException { + FileSystem fs = outPath.getFileSystem(jc); + final OutputStream outStream = Utilities.createCompressedStream(jc, fs.create(outPath, progress), isCompressed); + return new RecordWriter() { + @Override public void write(Writable r) throws IOException { + BytesWritable bw = (BytesWritable) r; + int recordLength = bw.getLength(); + + //Based on the row length to decide if the length is int or short + String rowLength = tableProperties + .getProperty(TeradataBinaryRecordReader.TD_ROW_LENGTH, TeradataBinaryRecordReader.DEFAULT_TD_ROW_LENGTH) + .toLowerCase(); + LOG.debug(format("The table property %s is: %s", TeradataBinaryRecordReader.TD_ROW_LENGTH, rowLength)); + + if (TeradataBinaryRecordReader.TD_ROW_LENGTH_TO_BYTE_NUM.containsKey(rowLength)) { + if (rowLength.equals(TeradataBinaryRecordReader.DEFAULT_TD_ROW_LENGTH)) { + EndianUtils.writeSwappedShort(outStream, (short) recordLength); // write the length using little endian + } else if (rowLength.equals(TeradataBinaryRecordReader.TD_ROW_LENGTH_1MB)) { + EndianUtils.writeSwappedInteger(outStream, recordLength); // write the length using little endian + } + } else { + throw new IllegalArgumentException(format("%s doesn't support the value %s, the supported values are %s", + TeradataBinaryRecordReader.TD_ROW_LENGTH, rowLength, + TeradataBinaryRecordReader.TD_ROW_LENGTH_TO_BYTE_NUM.keySet())); + } + + outStream.write(bw.getBytes(), 0, bw.getLength()); // write the content (the content is in little endian) + outStream.write(RECORD_END_BYTE); //write the record ending + } + + @Override public void close(boolean abort) throws IOException { + outStream.close(); + } + }; + } +} diff --git ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java new file mode 100644 index 0000000000..337b5d2e76 --- /dev/null +++ ql/src/java/org/apache/hadoop/hive/ql/io/TeradataBinaryRecordReader.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; + +import com.google.common.collect.ImmutableMap; +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.EndianUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.Seekable; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.ql.plan.PartitionDesc; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; + +import static java.lang.String.format; + +/** + * The TeradataBinaryRecordReader reads the record from Teradata binary files. + * + * In the Teradata Binary File, each record constructs as below: + * The first 2 bytes represents the length of the bytes next for this record. + * Then the null bitmap whose length is depended on the number of fields is followed. + * Then each field of the record is serialized into bytes - the serialization strategy is decided by the type of field. + * At last, there is one byte (0x0a) in the end of the record. + * + * This InputFormat currently doesn't support the split of the file. + * Teradata binary files are using little endian. + */ +public class TeradataBinaryRecordReader implements RecordReader { + + private static final Log LOG = LogFactory.getLog(TeradataBinaryRecordReader.class); + + private CompressionCodecFactory compressionCodecs = null; + private InputStream in; + private long start; + private long pos; + private long end; + private final Seekable filePosition; + private CompressionCodec codec; + + static final String TD_ROW_LENGTH = "teradata.row.length"; + static final Map TD_ROW_LENGTH_TO_BYTE_NUM = ImmutableMap.of("64kb", 2, "1mb", 4); + static final String DEFAULT_TD_ROW_LENGTH = "64kb"; + static final String TD_ROW_LENGTH_1MB = "1mb"; + + private byte[] recordLengthBytes; + private byte[] valueByteArray = new byte[65536]; // max byte array + private byte[] endOfRecord = new byte[1]; + + private int recordLength = 0; + + public TeradataBinaryRecordReader(JobConf job, FileSplit fileSplit) throws IOException { + LOG.debug("initialize the TeradataBinaryRecordReader"); + + String rowLength = job.get(TD_ROW_LENGTH); + if (rowLength == null) { + LOG.debug("No table property in JobConf. 
Try to recover the table directly"); + Map partitionDescMap = Utilities.getMapRedWork(job).getMapWork().getAliasToPartnInfo(); + for (String alias : Utilities.getMapRedWork(job).getMapWork().getAliasToPartnInfo().keySet()) { + LOG.debug(format("the current alias: %s", alias)); + rowLength = partitionDescMap.get(alias).getTableDesc().getProperties().getProperty(TD_ROW_LENGTH); + if (rowLength != null) { + break; + } + } + } + + if (rowLength == null) { + rowLength = DEFAULT_TD_ROW_LENGTH; + } else { + rowLength = rowLength.toLowerCase(); + } + + if (TD_ROW_LENGTH_TO_BYTE_NUM.containsKey(rowLength)) { + recordLengthBytes = new byte[TD_ROW_LENGTH_TO_BYTE_NUM.get(rowLength)]; + } else { + throw new IllegalArgumentException( + format("%s doesn't support the value %s, the supported values are %s", TD_ROW_LENGTH, rowLength, + TD_ROW_LENGTH_TO_BYTE_NUM.keySet())); + } + + start = fileSplit.getStart(); + end = start + fileSplit.getLength(); + + LOG.debug(format("The start of the file split is: %s", start)); + LOG.debug(format("The end of the file split is: %s", end)); + + final Path file = fileSplit.getPath(); + compressionCodecs = new CompressionCodecFactory(job); + codec = compressionCodecs.getCodec(file); + FileSystem fs = file.getFileSystem(job); + FSDataInputStream fileIn = fs.open(fileSplit.getPath()); + + /* currently the TeradataBinaryRecord file doesn't support file split at all */ + filePosition = fileIn; + if (isCompressedInput()) { + LOG.info(format("Input file is compressed. Using compression code %s", codec.getClass().getName())); + in = codec.createInputStream(fileIn); + } else { + LOG.info("The input file is not compressed"); + in = fileIn; + } + pos = start; + } + + /** + * Reads the next key/value pair from the input for processing. + * + * @param key the key to read data into + * @param value the value to read data into + * @return true iff a key/value was read, false if at EOF + */ + @Override public synchronized boolean next(NullWritable key, BytesWritable value) throws IOException { + + /* read the record length */ + int lengthExpected = recordLengthBytes.length; + int hasConsumed = readExpectedBytes(recordLengthBytes, lengthExpected); + if (hasConsumed == 0) { + LOG.info("Reach the End of File. 
No more record"); + return false; + } else if (hasConsumed < lengthExpected) { + LOG.error( + format("We expect %s bytes for the record length but read %d byte and reach the End of File.", lengthExpected, + hasConsumed)); + LOG.error(format("The current position in the file : %s", getFilePosition())); + LOG.error(format("The current consumed bytes: %s", pos)); + LOG.error(format("The bytes for the current record is: %s", Hex.encodeHexString(recordLengthBytes))); + throw new EOFException("When reading the record length, reach the unexpected end of file."); + } + /* get the record contend length to prepare to read the content */ + recordLength = EndianUtils.readSwappedUnsignedShort(recordLengthBytes, 0); + pos += lengthExpected; + + /* read the record content */ + lengthExpected = recordLength; + hasConsumed = readExpectedBytes(valueByteArray, lengthExpected); + if (hasConsumed < lengthExpected) { + LOG.error(format("We expect %s bytes for the record content but read %d byte and reach the End of File.", + lengthExpected, hasConsumed)); + LOG.error(format("The current position in the file : %s", getFilePosition())); + LOG.error(format("The current consumed bytes: %s", pos)); + LOG.error(format("The bytes for the current record is: %s", + Hex.encodeHexString(recordLengthBytes) + Hex.encodeHexString(valueByteArray))); + throw new EOFException("When reading the contend of the record, reach the unexpected end of file."); + } + value.set(valueByteArray, 0, recordLength); + pos += lengthExpected; + + /* read the record end */ + lengthExpected = endOfRecord.length; + hasConsumed = readExpectedBytes(endOfRecord, lengthExpected); + if (hasConsumed < lengthExpected) { + LOG.error(format("We expect %s bytes for the record end symbol but read %d byte and reach the End of File.", + lengthExpected, hasConsumed)); + LOG.error(format("The current position in the file : %s", getFilePosition())); + LOG.error(format("The current consumed bytes: %s", pos)); + LOG.error(format("The bytes for the current record is: %s", + Hex.encodeHexString(recordLengthBytes) + Hex.encodeHexString(valueByteArray) + Hex + .encodeHexString(endOfRecord))); + throw new EOFException("When reading the end of record, reach the unexpected end of file."); + } + + if (endOfRecord[0] != TeradataBinaryFileOutputFormat.RECORD_END_BYTE) { + throw new IOException(format("We expect 0x0a as the record end but get %s.", Hex.encodeHexString(endOfRecord))); + } + pos += lengthExpected; + + return true; + } + + /** + * Create an object of the appropriate type to be used as a key. + * + * @return a new key object. + */ + @Override public NullWritable createKey() { + return NullWritable.get(); + } + + /** + * Create an object of the appropriate type to be used as a value. + * + * @return a new value object. + */ + @Override public BytesWritable createValue() { + return new BytesWritable(); + } + + /** + * Returns the current position in the input. + * + * @return the current position in the input. + * @throws IOException + */ + @Override public long getPos() throws IOException { + return pos; + } + + /** + * + * @throws IOException + */ + @Override public void close() throws IOException { + if (in != null) { + in.close(); + } + } + + /** + * How much of the input has the {@link RecordReader} consumed i.e. + * has been processed by? + * + * @return progress from 0.0 to 1.0. 
+ * @throws IOException + */ + @Override public float getProgress() throws IOException { + if (start == end) { + return 0.0F; + } else { + return Math.min(1.0F, (float) (getFilePosition() - start) / (float) (end - start)); + } + } + + private boolean isCompressedInput() { + return codec != null; + } + + private synchronized long getFilePosition() throws IOException { + long retVal; + if (isCompressedInput() && filePosition != null) { + retVal = filePosition.getPos(); + } else { + retVal = getPos(); + } + return retVal; + } + + private synchronized int readExpectedBytes(byte[] toWrite, int lengthExpected) throws IOException { + int curPos = 0; + do { + int numOfByteRead = in.read(toWrite, curPos, lengthExpected - curPos); + if (numOfByteRead < 0) { + return curPos; + } else { + curPos += numOfByteRead; + } + } while (curPos < lengthExpected); + return curPos; + } +} diff --git ql/src/test/queries/clientpositive/test_teradatabinaryfile.q ql/src/test/queries/clientpositive/test_teradatabinaryfile.q new file mode 100644 index 0000000000..33ab6770f6 --- /dev/null +++ ql/src/test/queries/clientpositive/test_teradatabinaryfile.q @@ -0,0 +1,123 @@ +DROP TABLE if exists teradata_binary_table_64kb; +DROP TABLE if exists teradata_binary_table_1mb; +DROP TABLE if exists teradata_binary_table_64kb_insert; +DROP TABLE if exists teradata_binary_table_1mb_insert; + + +CREATE TABLE `teradata_binary_table_64kb`( + `test_tinyint` tinyint, + `test_smallint` smallint, + `test_int` int, + `test_bigint` bigint, + `test_double` double, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp, + `test_char` char(1), + `test_varchar` varchar(40), + `test_binary` binary + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='0', + 'teradata.char.charset'='LATIN', + 'teradata.row.length'='64KB' +); + +CREATE TABLE `teradata_binary_table_1mb`( + `test_tinyint` tinyint, + `test_smallint` smallint, + `test_int` int, + `test_bigint` bigint, + `test_double` double, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp, + `test_char` char(1), + `test_varchar` varchar(40), + `test_binary` binary + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='6', + 'teradata.char.charset'='UNICODE', + 'teradata.row.length'='1MB' +); + +CREATE TABLE `teradata_binary_table_64kb_insert`( + `test_tinyint` tinyint, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='0', + 'teradata.char.charset'='LATIN', + 'teradata.row.length'='64KB' +); + +CREATE TABLE `teradata_binary_table_1mb_insert`( + `test_tinyint` tinyint, + `test_int` int + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 
'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='6', + 'teradata.char.charset'='UNICODE', + 'teradata.row.length'='1MB' +); + +LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/teradata_binary_table.deflate' OVERWRITE INTO TABLE teradata_binary_table_64kb; +LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz' OVERWRITE INTO TABLE teradata_binary_table_1mb; + +SELECT * from teradata_binary_table_64kb; +SELECT * from teradata_binary_table_1mb; + +SELECT COUNT(*) FROM teradata_binary_table_64kb; +SELECT COUNT(*) FROM teradata_binary_table_1mb; + +SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_64kb; +SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_64kb; +SELECT max(Floor(test_decimal)) FROM teradata_binary_table_64kb; + +SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_1mb; +SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_1mb; +SELECT max(Floor(test_decimal)) FROM teradata_binary_table_1mb; + +SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_64kb GROUP BY test_tinyint; +SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_1mb GROUP BY test_tinyint; + +INSERT OVERWRITE TABLE teradata_binary_table_64kb_insert +SELECT test_tinyint, test_decimal, test_date, test_timestamp FROM teradata_binary_table_64kb; + +INSERT OVERWRITE TABLE teradata_binary_table_1mb_insert +SELECT 1, 15; + +DESC FORMATTED teradata_binary_table_64kb_insert; +DESC FORMATTED teradata_binary_table_1mb_insert; + +DROP TABLE if exists teradata_binary_table_64kb; +DROP TABLE if exists teradata_binary_table_1mb; +DROP TABLE if exists teradata_binary_table_64kb_insert; +DROP TABLE if exists teradata_binary_table_1mb_insert; diff --git ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out new file mode 100644 index 0000000000..9db13728d1 --- /dev/null +++ ql/src/test/results/clientpositive/test_teradatabinaryfile.q.out @@ -0,0 +1,537 @@ +PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert +POSTHOOK: type: DROPTABLE +PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE `teradata_binary_table_64kb`( + `test_tinyint` tinyint, + `test_smallint` smallint, + `test_int` int, + `test_bigint` bigint, + `test_double` double, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp, + `test_char` char(1), + `test_varchar` varchar(40), + `test_binary` binary + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 
'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='0', + 'teradata.char.charset'='LATIN', + 'teradata.row.length'='64KB' +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@teradata_binary_table_64kb +POSTHOOK: query: CREATE TABLE `teradata_binary_table_64kb`( + `test_tinyint` tinyint, + `test_smallint` smallint, + `test_int` int, + `test_bigint` bigint, + `test_double` double, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp, + `test_char` char(1), + `test_varchar` varchar(40), + `test_binary` binary + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='0', + 'teradata.char.charset'='LATIN', + 'teradata.row.length'='64KB' +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@teradata_binary_table_64kb +PREHOOK: query: CREATE TABLE `teradata_binary_table_1mb`( + `test_tinyint` tinyint, + `test_smallint` smallint, + `test_int` int, + `test_bigint` bigint, + `test_double` double, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp, + `test_char` char(1), + `test_varchar` varchar(40), + `test_binary` binary + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='6', + 'teradata.char.charset'='UNICODE', + 'teradata.row.length'='1MB' +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@teradata_binary_table_1mb +POSTHOOK: query: CREATE TABLE `teradata_binary_table_1mb`( + `test_tinyint` tinyint, + `test_smallint` smallint, + `test_int` int, + `test_bigint` bigint, + `test_double` double, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp, + `test_char` char(1), + `test_varchar` varchar(40), + `test_binary` binary + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='6', + 'teradata.char.charset'='UNICODE', + 'teradata.row.length'='1MB' +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@teradata_binary_table_1mb +PREHOOK: query: CREATE TABLE `teradata_binary_table_64kb_insert`( + `test_tinyint` tinyint, + `test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='0', + 'teradata.char.charset'='LATIN', + 'teradata.row.length'='64KB' +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@teradata_binary_table_64kb_insert +POSTHOOK: query: CREATE TABLE `teradata_binary_table_64kb_insert`( + `test_tinyint` tinyint, + 
`test_decimal` decimal(15,2), + `test_date` date, + `test_timestamp` timestamp + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='0', + 'teradata.char.charset'='LATIN', + 'teradata.row.length'='64KB' +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@teradata_binary_table_64kb_insert +PREHOOK: query: CREATE TABLE `teradata_binary_table_1mb_insert`( + `test_tinyint` tinyint, + `test_int` int + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='6', + 'teradata.char.charset'='UNICODE', + 'teradata.row.length'='1MB' +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@teradata_binary_table_1mb_insert +POSTHOOK: query: CREATE TABLE `teradata_binary_table_1mb_insert`( + `test_tinyint` tinyint, + `test_int` int + ) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde' +STORED AS INPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat' +OUTPUTFORMAT + 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat' +TBLPROPERTIES ( + 'teradata.timestamp.precision'='6', + 'teradata.char.charset'='UNICODE', + 'teradata.row.length'='1MB' +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@teradata_binary_table_1mb_insert +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/teradata_binary_table.deflate' OVERWRITE INTO TABLE teradata_binary_table_64kb +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@teradata_binary_table_64kb +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/teradata_binary_table.deflate' OVERWRITE INTO TABLE teradata_binary_table_64kb +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@teradata_binary_table_64kb +PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz' OVERWRITE INTO TABLE teradata_binary_table_1mb +PREHOOK: type: LOAD +#### A masked pattern was here #### +PREHOOK: Output: default@teradata_binary_table_1mb +POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/teradata_binary_file/td_data_with_1mb_rowsize.teradata.gz' OVERWRITE INTO TABLE teradata_binary_table_1mb +POSTHOOK: type: LOAD +#### A masked pattern was here #### +POSTHOOK: Output: default@teradata_binary_table_1mb +PREHOOK: query: SELECT * from teradata_binary_table_64kb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +POSTHOOK: query: SELECT * from teradata_binary_table_64kb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +10 34 139997714 32307660 18.6717 59.99 2018-08-23 2018-07-23 01:45:55 A NULL NULL +10 28 89082024 53367308 5.9069 27.90 2018-08-23 2018-07-23 19:45:36 A NULL NULL +10 31 65499801 9495835 5.9064 29.99 2018-08-23 2018-07-23 09:15:10 A NULL NULL +10 20 144923884 123337561 20.1037 50.50 2018-08-23 2018-07-23 22:49:52 A NULL NULL 
+10 9 118474716 110462827 18.6697 29.99 2018-08-23 2018-07-23 10:13:03 A NULL NULL +10 4 116098596 555556155 20.1017 29.99 2018-07-23 2018-07-23 13:12:10 X SELF_SERVICE SELF_SERVICE +10 10 84492975 100052093 15.4913 29.99 2018-08-23 2018-07-23 17:56:32 A NULL NULL +10 31 101314613 45413087 5.9064 29.99 2018-08-23 2018-07-23 11:26:24 A NULL NULL +10 1 156962113 554297748 NULL 29.99 2018-08-23 2018-07-23 11:31:31 A NULL NULL +10 10 92560875 380929783 20.1011 20.91 2018-07-30 2018-07-23 05:02:42 S RCHARGE_FAILURE RCHARGE_FAILURE +10 5 154490193 186062438 20.1037 29.99 2018-07-23 2018-07-23 10:17:20 X NULL NULL +10 31 2954435 34009387 0.0214 24.23 2018-08-23 2018-07-23 15:46:21 A NULL NULL +10 4 156942563 55362740 0.0024 29.99 2018-08-23 2018-07-23 08:16:49 A NULL NULL +10 31 90527523 126581551 7.5689 59.99 2018-08-23 2018-07-23 03:40:28 A NULL NULL +10 1 118477496 598803186 NULL 29.99 2018-08-23 2018-07-23 10:45:28 A NULL NULL +10 75 137653654 38440942 20.1037 29.99 2018-08-23 2018-07-23 19:01:04 A NULL NULL +10 2 142697304 106829658 20.1008 24.21 2018-07-23 2018-07-23 05:22:17 S RCHARGE_FAILURE RCHARGE_FAILURE +10 14 134043823 264156349 20.1008 24.21 2018-08-23 2018-07-23 12:12:48 A NULL NULL +10 7 91359485 7008957 20.1011 20.91 2018-08-23 2018-07-23 23:42:04 A NULL NULL +10 1 118512426 222159750 NULL 29.99 2018-08-23 2018-07-23 17:06:25 A NULL NULL +10 5 155168873 135968937 18.6697 59.99 2018-07-30 2018-07-23 18:01:35 S RCHARGE_FAILURE RCHARGE_FAILURE +10 4 151084943 38355275 20.1017 29.99 2018-08-23 2018-07-23 04:12:32 A NULL NULL +10 6 118452556 90264779 20.1017 59.99 2018-08-23 2018-07-23 05:18:44 A NULL NULL +10 31 53127101 18622653 0.0115 49.95 2018-08-23 2018-07-23 07:38:05 A NULL NULL +10 1 118479736 216825119 NULL 29.99 2018-08-23 2018-07-23 11:11:51 A NULL NULL +10 4 142708764 21984202 30.5785 27.50 2018-08-23 2018-07-23 10:36:22 A NULL NULL +10 4 142713364 33598449 20.1017 29.99 2018-07-23 2018-07-23 12:49:24 X SELF_SERVICE SELF_SERVICE +10 22 103578546 152144452 20.1017 29.99 2018-08-23 2018-07-23 11:18:44 A NULL NULL +10 22 111233194 69051 20.1017 29.99 2018-08-23 2018-07-23 08:58:16 A NULL NULL +10 12 132376034 2651098 20.1017 29.99 2018-08-23 2018-07-23 06:01:44 A NULL NULL +10 11 135778714 29866847 18.6717 59.99 2018-08-23 2018-07-23 02:35:58 A NULL NULL +10 10 118525066 34556421 5.9064 29.99 2018-08-23 2018-07-23 21:15:29 A NULL NULL +10 7 144897784 532208226 20.1017 29.99 2018-08-23 2018-07-23 14:35:42 A NULL NULL +10 34 87091713 93626084 5.9064 29.99 2018-08-23 2018-07-23 08:56:25 A NULL NULL +10 21 129323704 14298869 30.5516 55.03 2018-08-23 2018-07-23 05:48:14 A NULL NULL +10 31 112813163 36762074 5.9064 29.99 2018-08-23 2018-07-23 18:07:23 A NULL NULL +10 1 156980833 58308375 NULL 59.99 2018-08-23 2018-07-23 14:54:17 A NULL NULL +10 5 150357953 101207194 20.1017 29.99 2018-08-14 2018-07-23 13:53:14 S NULL NULL +10 1 118462836 668498576 NULL 55.03 2018-08-23 2018-07-23 07:44:11 A NULL NULL +10 7 129423664 312394041 20.1017 29.99 2018-08-23 2018-07-23 20:40:42 A NULL NULL +10 10 122518074 5448199 20.1017 29.99 2018-08-23 2018-07-23 01:30:03 A NULL NULL +10 3 113469566 593079639 20.1037 29.99 2018-08-23 2018-07-23 19:39:05 A NULL NULL +10 4 144878314 88960410 18.6689 55.03 2018-08-23 2018-07-23 11:43:56 A NULL NULL +10 8 146831593 168164335 30.5786 28.03 2018-08-23 2018-07-23 11:34:36 A NULL NULL +10 4 91358385 23752815 29.9896 27.21 2018-08-23 2018-07-23 23:20:30 A NULL NULL +10 3 118533306 286487393 30.5529 44.02 2019-07-23 2018-07-23 23:48:14 A NULL NULL +10 7 103618686 
339052539 18.6697 59.99 2018-08-23 2018-07-23 18:26:54 A NULL NULL +10 11 92556375 196464425 29.9896 27.21 2018-08-23 2018-07-23 03:15:07 A NULL NULL +10 11 137563254 239883707 18.6697 59.99 2018-08-23 2018-07-23 02:01:31 A NULL NULL +10 2 116078336 61997052 20.1017 29.99 2018-07-23 2018-07-23 00:55:05 X SELF_SERVICE SELF_SERVICE +PREHOOK: query: SELECT * from teradata_binary_table_1mb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +POSTHOOK: query: SELECT * from teradata_binary_table_1mb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +-6 0 -99999 -1 NULL 0.00 2011-01-02 2009-02-28 12:34:56 数 AABBCC +5 3200 -9999 NULL 3.14159 314000000.00 NULL 2011-02-28 12:34:56 ABC NULL +-127 32000 -9 1234567890123456789 2.01E10 3.14 2011-01-02 2022-02-28 12:34:56 数 ありがとうございます � 7��c� +-1 -32000 0 123456789012345678 2.0108E10 314.15 0001-12-31 NULL A thank you � 7��c� +127 32767 1 999000 2.034E12 0.04 2099-01-02 NULL I � 7��c� +2 -32767 9 987654321098765432 2.019876E12 NULL 2011-01-02 NULL あ test NULL +3 32 99 -1234567890123456789 2.0E12 3140000000000.00 2999-12-31 0001-12-28 12:34:56 ? *** � 7��c� +-127 32000 100 1234567890123456789 2.01E10 3.14 2011-01-02 2022-02-28 12:34:56 数 ありがとうございます � 7��c� +-1 -32000 101 123456789012345678 2.0108E10 314.15 2009-09-09 NULL A thank you � 7��c� +127 32767 102 999000 2.034E12 0.04 2011-01-02 NULL I � 7��c� +2 -32767 103 987654321098765432 2.019876E12 NULL 2011-01-02 NULL あ test NULL +3 32 104 -1234567890123456789 2.01E10 3.14 2011-01-02 0001-12-28 12:34:56 ? * � 7��c� +-4 320 105 0 2.01E10 3.14 2011-01-02 2010-02-28 12:34:56 NULL ||ありがとうございます|| � 7��c� +5 3200 106 NULL 3.14159 3.14 2011-01-02 2011-02-28 12:34:56 ABC NULL +-6 0 107 -1 NULL 0.00 2011-01-02 2009-02-28 12:34:56 数 AABBCC +7 NULL 108 65536 2.01E-8 NULL NULL 2099-02-28 12:34:56 数 NULL � 7��c� +NULL 1 109 256 1.01E18 12.00 2011-01-02 2999-12-31 12:34:56 数 NULL � 7��c� +-4 320 999 0 2.01E10 3.14 2011-01-02 2010-02-28 12:34:56 NULL ||ありがとうございます|| � 7��c� +NULL 1 1234 256 1.01E18 12.00 2000-01-02 2999-12-31 12:34:56 数 NULL � 7��c� +7 NULL 999999 65536 2.01E-8 NULL NULL 2099-02-28 12:34:56 数 NULL � 7��c� +PREHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_64kb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +POSTHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_64kb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +50 +PREHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_1mb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +POSTHOOK: query: SELECT COUNT(*) FROM teradata_binary_table_1mb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +20 +PREHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_64kb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +POSTHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_64kb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +2018 +PREHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_64kb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +#### A 
masked pattern was here #### +POSTHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_64kb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +2019 +PREHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_64kb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +POSTHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_64kb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +59 +PREHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_1mb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +POSTHOOK: query: SELECT max(date_format(test_timestamp, 'y')) FROM teradata_binary_table_1mb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +2999 +PREHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_1mb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +POSTHOOK: query: SELECT max(date_format(test_date, 'y')) FROM teradata_binary_table_1mb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +2999 +PREHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_1mb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +POSTHOOK: query: SELECT max(Floor(test_decimal)) FROM teradata_binary_table_1mb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +3140000000000 +PREHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_64kb GROUP BY test_tinyint +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +POSTHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_64kb GROUP BY test_tinyint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +#### A masked pattern was here #### +10 59.99 +PREHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_1mb GROUP BY test_tinyint +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +POSTHOOK: query: SELECT test_tinyint, MAX(test_decimal) FROM teradata_binary_table_1mb GROUP BY test_tinyint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_1mb +#### A masked pattern was here #### +NULL 12.00 +-127 3.14 +-6 0.00 +-4 3.14 +-1 314.15 +2 NULL +3 3140000000000.00 +5 314000000.00 +7 NULL +127 0.04 +PREHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_64kb_insert +SELECT test_tinyint, test_decimal, test_date, test_timestamp FROM teradata_binary_table_64kb +PREHOOK: type: QUERY +PREHOOK: Input: default@teradata_binary_table_64kb +PREHOOK: Output: default@teradata_binary_table_64kb_insert +POSTHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_64kb_insert +SELECT test_tinyint, test_decimal, test_date, test_timestamp FROM teradata_binary_table_64kb +POSTHOOK: type: QUERY +POSTHOOK: Input: default@teradata_binary_table_64kb +POSTHOOK: Output: default@teradata_binary_table_64kb_insert +POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_date SIMPLE 
[(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_date, type:date, comment:from deserializer), ] +POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_decimal SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_decimal, type:decimal(15,2), comment:from deserializer), ] +POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_timestamp SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_timestamp, type:timestamp, comment:from deserializer), ] +POSTHOOK: Lineage: teradata_binary_table_64kb_insert.test_tinyint SIMPLE [(teradata_binary_table_64kb)teradata_binary_table_64kb.FieldSchema(name:test_tinyint, type:tinyint, comment:from deserializer), ] +PREHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_1mb_insert +SELECT 1, 15 +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@teradata_binary_table_1mb_insert +POSTHOOK: query: INSERT OVERWRITE TABLE teradata_binary_table_1mb_insert +SELECT 1, 15 +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@teradata_binary_table_1mb_insert +POSTHOOK: Lineage: teradata_binary_table_1mb_insert.test_int SIMPLE [] +POSTHOOK: Lineage: teradata_binary_table_1mb_insert.test_tinyint EXPRESSION [] +PREHOOK: query: DESC FORMATTED teradata_binary_table_64kb_insert +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@teradata_binary_table_64kb_insert +POSTHOOK: query: DESC FORMATTED teradata_binary_table_64kb_insert +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@teradata_binary_table_64kb_insert +# col_name data_type comment +test_tinyint tinyint from deserializer +test_decimal decimal(15,2) from deserializer +test_date date from deserializer +test_timestamp timestamp from deserializer + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} + bucketing_version 2 + numFiles 1 + numRows 50 + rawDataSize 0 + teradata.char.charset LATIN + teradata.row.length 64KB + teradata.timestamp.precision 0 + totalSize 1800 +#### A masked pattern was here #### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde +InputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: DESC FORMATTED teradata_binary_table_1mb_insert +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@teradata_binary_table_1mb_insert +POSTHOOK: query: DESC FORMATTED teradata_binary_table_1mb_insert +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@teradata_binary_table_1mb_insert +# col_name data_type comment +test_tinyint tinyint from deserializer +test_int int from deserializer + +# Detailed Table Information +Database: default +#### A masked pattern was here #### +Retention: 0 +#### A masked pattern was here #### +Table Type: MANAGED_TABLE +Table Parameters: + COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"test_int\":\"true\",\"test_tinyint\":\"true\"}} + bucketing_version 2 + numFiles 1 + numRows 1 + rawDataSize 0 + teradata.char.charset UNICODE + teradata.row.length 1MB + teradata.timestamp.precision 6 + totalSize 11 +#### A masked pattern was here 
#### + +# Storage Information +SerDe Library: org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde +InputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat +OutputFormat: org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat +Compressed: No +Num Buckets: -1 +Bucket Columns: [] +Sort Columns: [] +Storage Desc Params: + serialization.format 1 +PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@teradata_binary_table_64kb +PREHOOK: Output: default@teradata_binary_table_64kb +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@teradata_binary_table_64kb +POSTHOOK: Output: default@teradata_binary_table_64kb +PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@teradata_binary_table_1mb +PREHOOK: Output: default@teradata_binary_table_1mb +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@teradata_binary_table_1mb +POSTHOOK: Output: default@teradata_binary_table_1mb +PREHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@teradata_binary_table_64kb_insert +PREHOOK: Output: default@teradata_binary_table_64kb_insert +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_64kb_insert +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@teradata_binary_table_64kb_insert +POSTHOOK: Output: default@teradata_binary_table_64kb_insert +PREHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@teradata_binary_table_1mb_insert +PREHOOK: Output: default@teradata_binary_table_1mb_insert +POSTHOOK: query: DROP TABLE if exists teradata_binary_table_1mb_insert +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@teradata_binary_table_1mb_insert +POSTHOOK: Output: default@teradata_binary_table_1mb_insert diff --git serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java new file mode 100644 index 0000000000..b26d3422f9 --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataInputStream.java @@ -0,0 +1,199 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.serde2.teradata; + +import org.apache.commons.io.input.SwappedDataInputStream; +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.Timestamp; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.math.BigInteger; +import java.text.ParseException; + +import static java.lang.String.format; + +/** + * The TeradataBinaryDataInputStream is used to handle the Teradata binary format input for record. + * Since the TD binary format uses little-endian to handle the SHORT, INT, LONG, DOUBLE and etc. + * while the Hadoop uses big-endian, + * We extend SwappedDataInputStream to handle these types and extend to handle the Teradata + * specific types like VARCHAR, CHAR, TIMESTAMP, DATE... + */ +public class TeradataBinaryDataInputStream extends SwappedDataInputStream { + + private static final int DATE_STRING_LENGTH = 8; + + /** + * Instantiates a new Teradata binary data input stream. + * + * @param input the input + */ + public TeradataBinaryDataInputStream(InputStream input) { + super(input); + } + + /** + * Read VARCHAR(N). + * The representation of Varchar in Teradata binary format is: + * the first two bytes represent the length N of this varchar field, + * the next N bytes represent the content of this varchar field. + * To pad the null varchar, the length will be 0 and the content will be none. + * + * @return the string + * @throws IOException the io exception + */ + public String readVarchar() throws IOException { + int varcharLength = readUnsignedShort(); + byte[] varcharContent = new byte[varcharLength]; + int numOfBytesRead = in.read(varcharContent); + if (varcharContent.length != 0 && numOfBytesRead != varcharLength) { + throw new EOFException( + format("Fail to read the varchar. Expect %d bytes, get %d bytes", varcharLength, numOfBytesRead)); + } + //force it to be UTF8 string + return new String(varcharContent, "UTF8"); + } + + /** + * Read TIMESTAMP(P). + * The representation of timestamp in Teradata binary format is: + * the byte number to read is based on the precision of timestamp, + * each byte represents one char and the timestamp is using string representation, + * eg: for TIMESTAMP(6), we need to read 26 bytes + * 31 39 31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33 32 30 30 + * will represent 1911-11-11 19:20:21.433200. + * the null timestamp will use space to pad. + * + * @param byteNum the byte number that will be read from inputstream + * @return the timestamp + * @throws IOException the io exception + */ + public Timestamp readTimestamp(Integer byteNum) throws IOException { + // yyyy-mm-dd hh:mm:ss + byte[] timestampContent = new byte[byteNum]; + int numOfBytesRead = in.read(timestampContent); + if (timestampContent.length != 0 && numOfBytesRead != byteNum) { + throw new EOFException( + format("Fail to read the timestamp. Expect %d bytes, get %d bytes", byteNum, numOfBytesRead)); + } + String timestampStr = new String(timestampContent, "UTF8"); + if (timestampStr.trim().length() == 0) { + return null; + } + return Timestamp.valueOf(timestampStr); + } + + /** + * Read DATE. 
+ * The representation of date in Teradata binary format is: + * The Date D is a int with 4 bytes using little endian, + * The representation is (D+19000000).ToString -> YYYYMMDD, + * eg: Date 07 b2 01 00 -> 111111 in little endian -> 19111111 - > 1911.11.11. + * the null date will use 0 to pad. + * + * @return the date + * @throws IOException the io exception + * @throws ParseException the parse exception + */ + public Date readDate() throws IOException, ParseException { + int di = readInt(); + if (di == 0) { + return null; + } + String dateString = String.valueOf(di + 19000000); + if (dateString.length() < DATE_STRING_LENGTH) { + dateString = StringUtils.leftPad(dateString, DATE_STRING_LENGTH, '0'); + } + Date date = new Date(); + date.setYear(Integer.parseInt(dateString.substring(0, 4))); + date.setMonth(Integer.parseInt(dateString.substring(4, 6))); + date.setDayOfMonth(Integer.parseInt(dateString.substring(6, 8))); + return date; + } + + /** + * Read CHAR(N). + * The representation of char in Teradata binary format is + * the byte number to read is based on the [charLength] * [bytePerChar] <- totalLength, + * bytePerChar is decided by the charset: LATAIN charset is 2 bytes per char and UNICODE charset is 3 bytes per char. + * the null char will use space to pad. + * + * @param totalLength the total length + * @return the string + * @throws IOException the io exception + */ + public String readChar(int totalLength) throws IOException { + byte[] charContent = new byte[totalLength]; + int numOfBytesRead = in.read(charContent); + if (charContent.length != 0 && numOfBytesRead != totalLength) { + throw new EOFException( + format("Fail to read the varchar. Expect %d bytes, get %d bytes", totalLength, numOfBytesRead)); + } + return new String(charContent, "UTF8"); + } + + /** + * Read DECIMAL(P, S). + * The representation of decimal in Teradata binary format is + * the byte number to read is decided solely by the precision(P), + * HiveDecimal is constructed through the byte array and scale. + * the null DECIMAL will use 0x00 to pad. + * + * @param scale the scale + * @param byteNum the byte num + * @return the hive decimal + * @throws IOException the io exception + */ + public HiveDecimal readDecimal(int scale, int byteNum) throws IOException { + byte[] decimalContent = new byte[byteNum]; + int numOfBytesRead = in.read(decimalContent); + if (decimalContent.length != 0 && numOfBytesRead != byteNum) { + throw new EOFException( + format("Fail to read the decimal. Expect %d bytes, get %d bytes", byteNum, numOfBytesRead)); + } + ArrayUtils.reverse(decimalContent); + return HiveDecimal.create(new BigInteger(decimalContent), scale); + } + + /** + * Read VARBYTE(N). + * The representation of VARBYTE in Teradata binary format is: + * the first two bytes represent the length N of this varchar field + * the next N bytes represent the content of this varchar field. + * To pad the null varbyte, the length will be 0 and the content will be none. + * + * @return the byte [ ] + * @throws IOException the io exception + */ + public byte[] readVarbyte() throws IOException { + int varbyteLength = readUnsignedShort(); + byte[] varbyteContent = new byte[varbyteLength]; + int numOfBytesRead = in.read(varbyteContent); + if (varbyteContent.length != 0 && numOfBytesRead != varbyteLength) { + throw new EOFException( + format("Fail to read the varbyte. 
Expect %d bytes, get %d bytes", varbyteLength, numOfBytesRead)); + } + return varbyteContent; + } +} diff --git serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java new file mode 100644 index 0000000000..f2f801dc8f --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinaryDataOutputStream.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.teradata; + +import org.apache.commons.io.EndianUtils; +import org.apache.commons.io.output.ByteArrayOutputStream; +import org.apache.commons.lang.ArrayUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; + +import java.io.IOException; +import java.math.BigInteger; +import java.util.Arrays; +import java.util.Collections; + +import static java.lang.String.join; +import static java.lang.String.format; + + +/** + * The TeradataBinaryDataOutputStream is used to produce the output in compliance with the Teradata binary format, + * so the output can be directly used to load into Teradata DB using TPT fastload. + * Since the TD binary format uses little-endian to handle the SHORT, INT, LONG, DOUBLE and etc. + * while the Hadoop uses big-endian, + * We extend SwappedDataInputStream to return qualified bytes for these types and extend to handle the Teradata + * specific types like VARCHAR, CHAR, TIMESTAMP, DATE... + */ +public class TeradataBinaryDataOutputStream extends ByteArrayOutputStream { + + private static final Log LOG = LogFactory.getLog(TeradataBinaryDataOutputStream.class); + + private static final int TIMESTAMP_NO_NANOS_BYTE_NUM = 19; + + public TeradataBinaryDataOutputStream() { + } + + /** + * Write VARCHAR(N). + * The representation of Varchar in Teradata binary format is: + * the first two bytes represent the length N of this varchar field, + * the next N bytes represent the content of this varchar field. + * To pad the null varchar, the length will be 0 and the content will be none. 
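+   * eg (illustrative): the VARCHAR value "ab" is serialized as the 4 bytes 02 00 61 62
+   * (little-endian length 2 followed by the UTF-8 content), and a null VARCHAR is just 00 00.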
+ * + * @param writable the writable + * @throws IOException the io exception + */ + public void writeVarChar(HiveVarcharWritable writable) throws IOException { + if (writable == null) { + EndianUtils.writeSwappedShort(this, (short) 0); + return; + } + Text t = writable.getTextValue(); + int varcharLength = t.getLength(); + EndianUtils.writeSwappedShort(this, (short) varcharLength); // write the varchar length + write(t.getBytes(), 0, varcharLength); // write the varchar content + } + + /** + * Write INT. + * using little-endian to write integer. + * + * @param i the + * @throws IOException the io exception + */ + public void writeInt(int i) throws IOException { + EndianUtils.writeSwappedInteger(this, i); + } + + /** + * Write TIMESTAMP(N). + * The representation of timestamp in Teradata binary format is: + * the byte number to read is based on the precision of timestamp, + * each byte represents one char and the timestamp is using string representation, + * eg: for 1911-11-11 19:20:21.433200 in TIMESTAMP(3), we will cut it to be 1911-11-11 19:20:21.433 and write + * 31 39 31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33. + * the null timestamp will use space to pad. + * + * @param timestamp the timestamp + * @param byteNum the byte number the timestamp will write + * @throws IOException the io exception + */ + public void writeTimestamp(TimestampWritableV2 timestamp, int byteNum) throws IOException { + if (timestamp == null) { + String pad = join("", Collections.nCopies(byteNum, " ")); + write(pad.getBytes("UTF8")); + return; + } + String sTimeStamp = timestamp.getTimestamp().toString(); + if (sTimeStamp.length() >= byteNum) { + write(sTimeStamp.substring(0, byteNum).getBytes("UTF8")); + return; + } + write(sTimeStamp.getBytes("UTF8")); + String pad; + if (sTimeStamp.length() == TIMESTAMP_NO_NANOS_BYTE_NUM) { + pad = "." + join("", Collections.nCopies(byteNum - sTimeStamp.length() - 1, "0")); + } else { + pad = join("", Collections.nCopies(byteNum - sTimeStamp.length(), "0")); + } + write(pad.getBytes("UTF8")); + } + + /** + * Write DOUBLE. + * using little-endian to write double. + * + * @param d the d + * @throws IOException the io exception + */ + public void writeDouble(double d) throws IOException { + EndianUtils.writeSwappedDouble(this, d); + } + + /** + * Write DATE. + * The representation of date in Teradata binary format is: + * The Date D is a int with 4 bytes using little endian. + * The representation is (YYYYMMDD - 19000000).toInt -> D + * eg. 1911.11.11 -> 19111111 -> 111111 -> 07 b2 01 00 in little endian. + * the null date will use 0 to pad. + * + * @param date the date + * @throws IOException the io exception + */ + public void writeDate(DateWritableV2 date) throws IOException { + if (date == null) { + EndianUtils.writeSwappedInteger(this, 0); + return; + } + int toWrite = date.get().getYear() * 10000 + date.get().getMonth() * 100 + date.get().getDay() - 19000000; + EndianUtils.writeSwappedInteger(this, toWrite); + } + + /** + * Write LONG. + * using little-endian to write double. + * + * @param l the l + * @throws IOException the io exception + */ + public void writeLong(long l) throws IOException { + EndianUtils.writeSwappedLong(this, l); + } + + /** + * Write CHAR(N). + * The representation of char in Teradata binary format is: + * the byte number to read is based on the [charLength] * [bytePerChar] <- totalLength, + * bytePerChar is decided by the charset: LATAIN charset is 2 bytes per char and UNICODE charset is 3 bytes per char. 
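+   * eg (illustrative): a CHAR(3) column occupies 3 * 2 = 6 bytes with the LATIN charset and
+   * 3 * 3 = 9 bytes with the UNICODE charset; shorter content is right-padded with spaces.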
+ * the null char will use space to pad. + * + * @param writable the writable + * @param length the byte n + * @throws IOException the io exception + */ + public void writeChar(HiveCharWritable writable, int length) throws IOException { + if (writable == null) { + String pad = join("", Collections.nCopies(length, " ")); + write(pad.getBytes("UTF8")); + return; + } + Text t = writable.getStrippedValue(); + int contentLength = t.getLength(); + write(t.getBytes(), 0, contentLength); + if (length - contentLength < 0) { + throw new IOException(format("The byte num %s of HiveCharWritable is more than the byte num %s we can hold. " + + "The content of HiveCharWritable is %s", contentLength, length, writable.getPaddedValue())); + } + if (length > contentLength) { + String pad = join("", Collections.nCopies(length - contentLength, " ")); + write(pad.getBytes("UTF8")); + } + } + + /** + * Write DECIMAL(P, S). + * The representation of decimal in Teradata binary format is: + * the byte number to read is decided solely by the precision(P), + * HiveDecimal is constructed through the byte array and scale. + * the rest of byte will use 0x00 to pad (positive) and use 0xFF to pad (negative). + * the null DECIMAL will use 0x00 to pad. + * + * @param writable the writable + * @param byteNum the byte num + * @throws IOException the io exception + */ + public void writeDecimal(HiveDecimalWritable writable, int byteNum, int scale) throws IOException { + if (writable == null) { + byte[] pad = new byte[byteNum]; + write(pad); + return; + } + // since the HiveDecimal will auto adjust the scale to save resource + // we need to adjust it back otherwise the output bytes will be wrong + int hiveScale = writable.getHiveDecimal().scale(); + BigInteger bigInteger = writable.getHiveDecimal().unscaledValue(); + if (hiveScale < scale) { + BigInteger multiplicand = new BigInteger("1" + join("", Collections.nCopies(scale - hiveScale, "0"))); + bigInteger = bigInteger.multiply(multiplicand); + } + byte[] content = bigInteger.toByteArray(); + int signBit = content[0] >> 7 & 1; + ArrayUtils.reverse(content); + write(content); + if (byteNum > content.length) { + byte[] pad; + if (signBit == 0) { + pad = new byte[byteNum - content.length]; + } else { + pad = new byte[byteNum - content.length]; + Arrays.fill(pad, (byte) 255); + } + write(pad); + } + } + + /** + * Write SHORT. + * using little-endian to write short. + * + * @param s the s + * @throws IOException the io exception + */ + public void writeShort(short s) throws IOException { + EndianUtils.writeSwappedShort(this, s); + } + + /** + * Write VARBYTE(N). + * The representation of VARBYTE in Teradata binary format is: + * the first two bytes represent the length N of this varchar field, + * the next N bytes represent the content of this varchar field. + * To pad the null varbyte, the length will be 0 and the content will be none. 
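+   * eg (illustrative): a VARBYTE holding the single byte 0xff is serialized as 01 00 ff,
+   * and a null VARBYTE is just the two length bytes 00 00.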
+ * + * @param writable the writable + * @throws IOException the io exception + */ + public void writeVarByte(BytesWritable writable) throws IOException { + if (writable == null) { + EndianUtils.writeSwappedShort(this, (short) 0); + return; + } + int varbyteLength = writable.getLength(); + EndianUtils.writeSwappedShort(this, (short) varbyteLength); // write the varbyte length + write(writable.getBytes(), 0, varbyteLength); // write the varchar content + } +} diff --git serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java new file mode 100644 index 0000000000..ccf5f44cfd --- /dev/null +++ serde/src/java/org/apache/hadoop/hive/serde2/teradata/TeradataBinarySerde.java @@ -0,0 +1,597 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.teradata; + +import com.google.common.collect.ImmutableMap; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeSpec; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.hive.common.type.Date; + +import javax.annotation.Nullable; +import java.io.ByteArrayInputStream; +import java.io.EOFException; +import java.io.IOException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; + +import static java.lang.String.format; + +/** + * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde. + * TeradataBinarySerde handles the serialization and deserialization of Teradata Binary Record + * passed from TeradataBinaryRecordReader. + * + * The Teradata Binary Record uses little-endian to handle the SHORT, INT, LONG, DOUBLE... + * We extend SwappedDataInputStream to handle these types and extend to handle the Teradata + * specific types like VARCHAR, CHAR, TIMESTAMP, DATE... + * + * Currently we support 11 Teradata data types: VARCHAR ,INTEGER, TIMESTAMP, FLOAT, DATE, + * BYTEINT, BIGINT, CHARACTER, DECIMAL, SMALLINT, VARBYTE. + * The mapping between Teradata data type and Hive data type is + * Teradata Data Type: Hive Data Type + * VARCHAR: VARCHAR, + * INTEGER: INT, + * TIMESTAMP: TIMESTAMP, + * FLOAT: DOUBLE, + * DATE: DATE, + * BYTEINT: TINYINT , + * BIGINT: BIGINT, + * CHAR: CHAR, + * DECIMAL: DECIMAL, + * SMALLINT: SMALLINT, + * VARBYTE: BINARY. + * + * TeradataBinarySerde currently doesn't support complex types like MAP, ARRAY and STRUCT. 
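+ *
+ * A table backed by this SerDe can be declared, for example, roughly as follows (illustrative DDL
+ * only; table and column names are placeholders):
+ * CREATE TABLE td_example (id INT, name VARCHAR(100))
+ * ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.teradata.TeradataBinarySerde'
+ * STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileInputFormat'
+ * OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.TeradataBinaryFileOutputFormat'
+ * TBLPROPERTIES ('teradata.timestamp.precision'='6', 'teradata.char.charset'='LATIN');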
+ */ +@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, + serdeConstants.LIST_COLUMN_TYPES }) public class TeradataBinarySerde extends AbstractSerDe { + private static final Log LOG = LogFactory.getLog(TeradataBinarySerde.class); + + public static final String TD_SCHEMA_LITERAL = "teradata.schema.literal"; + + private StructObjectInspector rowOI; + private ArrayList row; + private byte[] inForNull; + + private int numCols; + private List columnNames; + private List columnTypes; + + private TeradataBinaryDataOutputStream out; + private BytesWritable serializeBytesWritable; + private byte[] outForNull; + + public static final String TD_TIMESTAMP_PRECISION = "teradata.timestamp.precision"; + private int timestampPrecision; + private static final int DEFAULT_TIMESTAMP_BYTE_NUM = 19; + private static final String DEFAULT_TIMESTAMP_PRECISION = "6"; + + public static final String TD_CHAR_SET = "teradata.char.charset"; + private String charCharset; + private static final String DEFAULT_CHAR_CHARSET = "UNICODE"; + private static final Map CHARSET_TO_BYTE_NUM = ImmutableMap.of("LATIN", 2, "UNICODE", 3); + + /** + * Initialize the HiveSerializer. + * + * @param conf + * System properties. Can be null in compile time + * @param tbl + * table properties + * @throws SerDeException + */ + @Override public void initialize(@Nullable Configuration conf, Properties tbl) throws SerDeException { + columnNames = Arrays.asList(tbl.getProperty(serdeConstants.LIST_COLUMNS).split(",")); + + String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); + LOG.debug(serdeConstants.LIST_COLUMN_TYPES + ": " + columnTypeProperty); + if (columnTypeProperty.length() == 0) { + columnTypes = new ArrayList(); + } else { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + assert columnNames.size() == columnTypes.size(); + numCols = columnNames.size(); + + // get the configured teradata timestamp precision + // you can configure to generate timestamp of different precision in the binary file generated by TPT/BTEQ + timestampPrecision = Integer.parseInt(tbl.getProperty(TD_TIMESTAMP_PRECISION, DEFAULT_TIMESTAMP_PRECISION)); + + // get the configured teradata char charset + // in TD, latin charset will have 2 bytes per char and unicode will have 3 bytes per char + charCharset = tbl.getProperty(TD_CHAR_SET, DEFAULT_CHAR_CHARSET); + if (!CHARSET_TO_BYTE_NUM.containsKey(charCharset)) { + throw new SerDeException( + format("%s isn't supported in Teradata Char Charset %s", charCharset, CHARSET_TO_BYTE_NUM.keySet())); + } + + // All columns have to be primitive. + // Constructing the row ObjectInspector: + List columnOIs = new ArrayList(numCols); + for (int i = 0; i < numCols; i++) { + if (columnTypes.get(i).getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new SerDeException( + getClass().getName() + " only accepts primitive columns, but column[" + i + "] named " + columnNames.get(i) + + " has category " + columnTypes.get(i).getCategory()); + } + columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(columnTypes.get(i))); + } + + rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs); + + // Constructing the row object and will be reused for all rows + row = new ArrayList(numCols); + for (int i = 0; i < numCols; i++) { + row.add(null); + } + + // Initialize vars related to Null Array which represents the null bitmap + int byteNumForNullArray = (numCols / 8) + ((numCols % 8 == 0) ? 
0 : 1); + LOG.debug(format("The Null Bytes for each record will have %s bytes", byteNumForNullArray)); + inForNull = new byte[byteNumForNullArray]; + + out = new TeradataBinaryDataOutputStream(); + serializeBytesWritable = new BytesWritable(); + outForNull = new byte[byteNumForNullArray]; + } + + /** + * Returns the Writable class that would be returned by the serialize method. + * This is used to initialize SequenceFile header. + */ + @Override public Class getSerializedClass() { + return ByteWritable.class; + } + + /** + * Serialize an object by navigating inside the Object with the + * ObjectInspector. In most cases, the return value of this function will be + * constant since the function will reuse the Writable object. If the client + * wants to keep a copy of the Writable, the client needs to clone the + * returned value. + + * @param obj + * @param objInspector + */ + @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + try { + out.reset(); + final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector; + final List fieldRefs = outputRowOI.getAllStructFieldRefs(); + + if (fieldRefs.size() != numCols) { + throw new SerDeException( + "Cannot serialize the object because there are " + fieldRefs.size() + " fieldRefs but the table defined " + + numCols + " columns."); + } + + // Fully refresh the Null Array to write into the out + for (int i = 0; i < numCols; i++) { + Object objectForField = outputRowOI.getStructFieldData(obj, fieldRefs.get(i)); + if (objectForField == null) { + outForNull[i / 8] = (byte) (outForNull[i / 8] | (0x01 << (7 - (i % 8)))); + } else { + outForNull[i / 8] = (byte) (outForNull[i / 8] & ~(0x01 << (7 - (i % 8)))); + } + } + out.write(outForNull); + + // serialize each field using FieldObjectInspector + for (int i = 0; i < numCols; i++) { + Object objectForField = outputRowOI.getStructFieldData(obj, fieldRefs.get(i)); + serializeField(objectForField, fieldRefs.get(i).getFieldObjectInspector(), columnTypes.get(i)); + } + + serializeBytesWritable.set(out.toByteArray(), 0, out.size()); + return serializeBytesWritable; + } catch (IOException e) { + throw new SerDeException(e); + } + } + + private void serializeField(Object objectForField, ObjectInspector oi, TypeInfo ti) + throws IOException, SerDeException { + switch (oi.getCategory()) { + case PRIMITIVE: + PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi; + switch (poi.getPrimitiveCategory()) { + // Teradata Type: BYTEINT + case BYTE: + ByteObjectInspector boi = (ByteObjectInspector) poi; + byte b = 0; + if (objectForField != null) { + b = boi.get(objectForField); + } + out.write(b); + return; + // Teradata Type: SMALLINT + case SHORT: + ShortObjectInspector spoi = (ShortObjectInspector) poi; + short s = 0; + if (objectForField != null) { + s = spoi.get(objectForField); + } + out.writeShort(s); + return; + // Teradata Type: INT + case INT: + IntObjectInspector ioi = (IntObjectInspector) poi; + int i = 0; + if (objectForField != null) { + i = ioi.get(objectForField); + } + out.writeInt(i); + return; + // Teradata Type: BIGINT + case LONG: + LongObjectInspector loi = (LongObjectInspector) poi; + long l = 0; + if (objectForField != null) { + l = loi.get(objectForField); + } + out.writeLong(l); + return; + // Teradata Type: FLOAT + case DOUBLE: + DoubleObjectInspector doi = (DoubleObjectInspector) poi; + double d = 0; + if (objectForField != null) { + d = doi.get(objectForField); + } + out.writeDouble(d); + return; + // Teradata Type: 
VARCHAR + case VARCHAR: + HiveVarcharObjectInspector hvoi = (HiveVarcharObjectInspector) poi; + HiveVarcharWritable hv = hvoi.getPrimitiveWritableObject(objectForField); + // assert the length of varchar record fits into the table definition + if (hv != null) { + assert ((VarcharTypeInfo) ti).getLength() >= hv.getHiveVarchar().getCharacterLength(); + } + out.writeVarChar(hv); + return; + // Teradata Type: TIMESTAMP + case TIMESTAMP: + TimestampObjectInspector tsoi = (TimestampObjectInspector) poi; + TimestampWritableV2 ts = tsoi.getPrimitiveWritableObject(objectForField); + out.writeTimestamp(ts, getTimeStampByteNum(timestampPrecision)); + return; + // Teradata Type: DATE + case DATE: + DateObjectInspector dtoi = (DateObjectInspector) poi; + DateWritableV2 dw = dtoi.getPrimitiveWritableObject(objectForField); + out.writeDate(dw); + return; + // Teradata Type: CHAR + case CHAR: + HiveCharObjectInspector coi = (HiveCharObjectInspector) poi; + HiveCharWritable hc = coi.getPrimitiveWritableObject(objectForField); + // assert the length of char record fits into the table definition + if (hc != null) { + assert ((CharTypeInfo) ti).getLength() >= hc.getHiveChar().getCharacterLength(); + } + out.writeChar(hc, getCharByteNum(charCharset) * ((CharTypeInfo) ti).getLength()); + return; + // Teradata Type: DECIMAL + case DECIMAL: + DecimalTypeInfo dtype = (DecimalTypeInfo) ti; + int precision = dtype.precision(); + int scale = dtype.scale(); + HiveDecimalObjectInspector hdoi = (HiveDecimalObjectInspector) poi; + HiveDecimalWritable hd = hdoi.getPrimitiveWritableObject(objectForField); + // assert the precision of decimal record fits into the table definition + if (hd != null) { + assert (dtype.getPrecision() >= hd.precision()); + } + out.writeDecimal(hd, getDecimalByteNum(precision), scale); + return; + // Teradata Type: VARBYTE + case BINARY: + BinaryObjectInspector bnoi = (BinaryObjectInspector) poi; + BytesWritable byw = bnoi.getPrimitiveWritableObject(objectForField); + out.writeVarByte(byw); + return; + default: + throw new SerDeException("Unrecognized type: " + poi.getPrimitiveCategory()); + } + // Currently, serialization of complex types is not supported + case LIST: + case MAP: + case STRUCT: + default: + throw new SerDeException("Unrecognized type: " + oi.getCategory()); + } + } + + @Override public SerDeStats getSerDeStats() { + // no support for statistics + return null; + } + + /** + * Deserialize an object out of a Writable blob. In most cases, the return + * value of this function will be constant since the function will reuse the + * returned object. If the client wants to keep a copy of the object, the + * client needs to clone the returned value by calling + * ObjectInspectorUtils.getStandardObject(). + * + * @param blob + * The Writable object containing a serialized object + * @return A Java object representing the contents in the blob. 
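+   *
+   * eg (illustrative): for an 11-column table such as the one used in the tests, each record body
+   * starts with ceil(11 / 8) = 2 null-indicator bytes; if those bytes are 01 60
+   * (bits 00000001 01100000), then fields 7, 9 and 10 (0-based) are null, and their padded
+   * default bytes are still consumed from the stream before being discarded.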
+ */ + @Override public Object deserialize(Writable blob) throws SerDeException { + try { + BytesWritable data = (BytesWritable) blob; + + // initialize the data to be the input stream + TeradataBinaryDataInputStream in = + new TeradataBinaryDataInputStream(new ByteArrayInputStream(data.getBytes(), 0, data.getLength())); + + int numOfByteRead = in.read(inForNull); + + if (inForNull.length != 0 && numOfByteRead != inForNull.length) { + throw new EOFException("not enough bytes for one object"); + } + + boolean isNull; + for (int i = 0; i < numCols; i++) { + // get if the ith field is null or not + isNull = ((inForNull[i / 8] & (128 >> (i % 8))) != 0); + row.set(i, deserializeField(in, columnTypes.get(i), row.get(i), isNull)); + } + + //After deserializing all the fields, the input should be over in which case in.read will return -1 + if (in.read() != -1) { + throw new EOFException("The inputstream has more after we deserialize all the fields - this is unexpected"); + } + } catch (EOFException e) { + LOG.warn("Catch thrown exception", e); + LOG.warn("This record has been polluted. We have reset all the row fields to be null"); + for (int i = 0; i < numCols; i++) { + row.set(i, null); + } + } catch (IOException e) { + throw new SerDeException(e); + } catch (ParseException e) { + throw new SerDeException(e); + } + return row; + } + + private Object deserializeField(TeradataBinaryDataInputStream in, TypeInfo type, Object reuse, boolean isNull) + throws IOException, ParseException, SerDeException { + // isNull: + // In the Teradata Binary file, even the field is null (isNull=true), + // thd data still has some default values to pad the record. + // In this case, you cannot avoid reading the bytes even it is not used. + switch (type.getCategory()) { + case PRIMITIVE: + PrimitiveTypeInfo ptype = (PrimitiveTypeInfo) type; + switch (ptype.getPrimitiveCategory()) { + case VARCHAR: // Teradata Type: VARCHAR + String st = in.readVarchar(); + if (isNull) { + return null; + } else { + HiveVarcharWritable r = reuse == null ? new HiveVarcharWritable() : (HiveVarcharWritable) reuse; + r.set(st, ((VarcharTypeInfo) type).getLength()); + return r; + } + case INT: // Teradata Type: INT + int i = in.readInt(); + if (isNull) { + return null; + } else { + IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse; + r.set(i); + return r; + } + case TIMESTAMP: // Teradata Type: TIMESTAMP + Timestamp ts = in.readTimestamp(getTimeStampByteNum(timestampPrecision)); + if (isNull) { + return null; + } else { + TimestampWritableV2 r = reuse == null ? new TimestampWritableV2() : (TimestampWritableV2) reuse; + r.set(ts); + return r; + } + case DOUBLE: // Teradata Type: FLOAT + double d = in.readDouble(); + if (isNull) { + return null; + } else { + DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse; + r.set(d); + return r; + } + case DATE: // Teradata Type: DATE + Date dt = in.readDate(); + if (isNull) { + return null; + } else { + DateWritableV2 r = reuse == null ? new DateWritableV2() : (DateWritableV2) reuse; + r.set(dt); + return r; + } + case BYTE: // Teradata Type: BYTEINT + byte bt = in.readByte(); + if (isNull) { + return null; + } else { + ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse; + r.set(bt); + return r; + } + case LONG: // Teradata Type: BIGINT + long l = in.readLong(); + if (isNull) { + return null; + } else { + LongWritable r = reuse == null ? 
new LongWritable() : (LongWritable) reuse; + r.set(l); + return r; + } + case CHAR: // Teradata Type: CHAR + CharTypeInfo ctype = (CharTypeInfo) type; + int length = ctype.getLength(); + String c = in.readChar(length * getCharByteNum(charCharset)); + if (isNull) { + return null; + } else { + HiveCharWritable r = reuse == null ? new HiveCharWritable() : (HiveCharWritable) reuse; + r.set(c, length); + return r; + } + case DECIMAL: // Teradata Type: DECIMAL + DecimalTypeInfo dtype = (DecimalTypeInfo) type; + int precision = dtype.precision(); + int scale = dtype.scale(); + HiveDecimal hd = in.readDecimal(scale, getDecimalByteNum(precision)); + if (isNull) { + return null; + } else { + HiveDecimalWritable r = (reuse == null ? new HiveDecimalWritable() : (HiveDecimalWritable) reuse); + r.set(hd); + return r; + } + case SHORT: // Teradata Type: SMALLINT + short s = in.readShort(); + if (isNull) { + return null; + } else { + ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse; + r.set(s); + return r; + } + case BINARY: // Teradata Type: VARBYTE + byte[] content = in.readVarbyte(); + if (isNull) { + return null; + } else { + BytesWritable r = new BytesWritable(); + r.set(content, 0, content.length); + return r; + } + default: + throw new SerDeException("Unrecognized type: " + ptype.getPrimitiveCategory()); + } + // Currently, deserialization of complex types is not supported + case LIST: + case MAP: + case STRUCT: + default: + throw new SerDeException("Unsupported category: " + type.getCategory()); + } + } + + /** + * Get the object inspector that can be used to navigate through the internal + * structure of the Object returned from deserialize(...). + */ + @Override public ObjectInspector getObjectInspector() throws SerDeException { + return rowOI; + } + + private int getTimeStampByteNum(int precision) { + if (precision == 0) { + return DEFAULT_TIMESTAMP_BYTE_NUM; + } else { + return precision + 1 + DEFAULT_TIMESTAMP_BYTE_NUM; + } + } + + private int getCharByteNum(String charset) throws SerDeException { + if (!CHARSET_TO_BYTE_NUM.containsKey(charCharset)) { + throw new SerDeException( + format("%s isn't supported in Teradata Char Charset %s", charCharset, CHARSET_TO_BYTE_NUM.keySet())); + } else { + return CHARSET_TO_BYTE_NUM.get(charset); + } + } + + private int getDecimalByteNum(int precision) throws SerDeException { + if (precision <= 0) { + throw new SerDeException(format("the precision of Decimal should be bigger than 0. %d is illegal", precision)); + } + if (precision <= 2) { + return 1; + } + if (precision <= 4) { + return 2; + } + if (precision <= 9) { + return 4; + } + if (precision <= 18) { + return 8; + } + if (precision <= 38) { + return 16; + } + throw new IllegalArgumentException( + format("the precision of Decimal should be smaller than 39. %d is illegal", precision)); + } +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java new file mode 100644 index 0000000000..af81fe30c8 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDate.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.teradata; + +import com.google.common.io.BaseEncoding; +import junit.framework.TestCase; +import org.apache.hadoop.hive.common.type.Date; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.io.BytesWritable; +import org.junit.Assert; + +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +/** + * Test the data type DATE for Teradata binary format. + */ +public class TestTeradataBinarySerdeForDate extends TestCase { + + private final TeradataBinarySerde serde = new TeradataBinarySerde(); + private final Properties props = new Properties(); + + protected void setUp() throws Exception { + props.setProperty(serdeConstants.LIST_COLUMNS, "TD_DATE"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "date"); + serde.initialize(null, props); + } + + public void testTimestampBefore1900() throws Exception { + + //0060-01-01 + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("00653de7fe")); + + List row = (List) serde.deserialize(in); + Date ts = ((DateWritableV2) row.get(0)).get(); + Assert.assertEquals(ts.getYear(), 60); + Assert.assertEquals(ts.getMonth(), 1); + Assert.assertEquals(ts.getDay(), 1); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testTimestampAfter1900() throws Exception { + + //9999-01-01 + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("0095cfd304")); + + List row = (List) serde.deserialize(in); + Date ts = ((DateWritableV2) row.get(0)).get(); + Assert.assertEquals(ts.getYear(), 9999); + Assert.assertEquals(ts.getMonth(), 1); + Assert.assertEquals(ts.getDay(), 1); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java new file mode 100644 index 0000000000..6abdd3f722 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForDecimal.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.teradata; + +import com.google.common.io.BaseEncoding; +import junit.framework.TestCase; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.io.BytesWritable; +import org.junit.Assert; + +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +/** + * Test the data type DECIMAL for Teradata binary format. + */ +public class TestTeradataBinarySerdeForDecimal extends TestCase { + + private final TeradataBinarySerde serde = new TeradataBinarySerde(); + private final Properties props = new Properties(); + + protected void setUp() throws Exception { + props.setProperty(serdeConstants.LIST_COLUMNS, "TD_DECIMAL"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "decimal(9,5)"); + + serde.initialize(null, props); + } + + public void testPositiveFraction() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("0064000000")); + + List row = (List) serde.deserialize(in); + Assert.assertTrue("0.001".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString())); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testNegativeFraction() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("009cffffff")); + + List row = (List) serde.deserialize(in); + Assert.assertTrue("-0.001".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString())); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testPositiveNumber1() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("00a0860100")); + + List row = (List) serde.deserialize(in); + Assert.assertTrue("1".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString())); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testNegativeNumber1() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("006079feff")); + + List row = (List) serde.deserialize(in); + Assert.assertTrue("-1".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString())); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testPositiveNumber2() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("0080969800")); + + List row = (List) serde.deserialize(in); + Assert.assertTrue("100".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString())); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + 
Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testNegativeNumber2() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode("000065c4e0")); + + List row = (List) serde.deserialize(in); + Assert.assertTrue("-5240".equals(((HiveDecimalWritable) row.get(0)).getHiveDecimal().toString())); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java new file mode 100644 index 0000000000..a6cf2c14e9 --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeForTimeStamp.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.teradata; + +import com.google.common.io.BaseEncoding; +import junit.framework.TestCase; +import org.apache.hadoop.hive.common.type.Timestamp; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.io.BytesWritable; +import org.junit.Assert; + +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +/** + * Test the data type TIMESTAMP for Teradata binary format. 
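+ * The serialized timestamp is a fixed-width character field: 19 bytes for precision 0
+ * (yyyy-mm-dd hh:mm:ss) and 19 + 1 + p bytes for precision p > 0, eg 26 bytes for TIMESTAMP(6).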
+ */ +public class TestTeradataBinarySerdeForTimeStamp extends TestCase { + + private final TeradataBinarySerde serde = new TeradataBinarySerde(); + private final Properties props = new Properties(); + + protected void setUp() throws Exception { + props.setProperty(serdeConstants.LIST_COLUMNS, "TD_TIMESTAMP"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "timestamp"); + } + + public void testTimestampPrecision6() throws Exception { + props.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "6"); + serde.initialize(null, props); + + //2012-10-01 12:00:00.110000 + BytesWritable in = new BytesWritable( + BaseEncoding.base16().lowerCase().decode("00323031322d31302d30312031323a30303a30302e313130303030")); + + List row = (List) serde.deserialize(in); + Timestamp ts = ((TimestampWritableV2) row.get(0)).getTimestamp(); + Assert.assertEquals(ts.getYear(), 2012); + Assert.assertEquals(ts.getMonth(), 10); + Assert.assertEquals(ts.getDay(), 1); + Assert.assertEquals(ts.getHours(), 12); + Assert.assertEquals(ts.getMinutes(), 0); + Assert.assertEquals(ts.getSeconds(), 0); + Assert.assertEquals(ts.getNanos(), 110000000); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testTimestampPrecision0() throws Exception { + props.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "0"); + serde.initialize(null, props); + + //2012-10-01 12:00:00 + BytesWritable in = + new BytesWritable(BaseEncoding.base16().lowerCase().decode("00323031322d31302d30312031323a30303a3030")); + + List row = (List) serde.deserialize(in); + Timestamp ts = ((TimestampWritableV2) row.get(0)).getTimestamp(); + Assert.assertEquals(ts.getYear(), 2012); + Assert.assertEquals(ts.getMonth(), 10); + Assert.assertEquals(ts.getDay(), 1); + Assert.assertEquals(ts.getHours(), 12); + Assert.assertEquals(ts.getMinutes(), 0); + Assert.assertEquals(ts.getSeconds(), 0); + Assert.assertEquals(ts.getNanos(), 0); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testTimestampPrecision3() throws Exception { + props.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "3"); + serde.initialize(null, props); + + //2012-10-01 12:00:00.345 + BytesWritable in = + new BytesWritable(BaseEncoding.base16().lowerCase().decode("00323031322d31302d30312031323a30303a30302e333435")); + + List row = (List) serde.deserialize(in); + Timestamp ts = ((TimestampWritableV2) row.get(0)).getTimestamp(); + Assert.assertEquals(ts.getYear(), 2012); + Assert.assertEquals(ts.getMonth(), 10); + Assert.assertEquals(ts.getDay(), 1); + Assert.assertEquals(ts.getHours(), 12); + Assert.assertEquals(ts.getMinutes(), 0); + Assert.assertEquals(ts.getSeconds(), 0); + Assert.assertEquals(ts.getNanos(), 345000000); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } +} diff --git serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java new file mode 100644 index 0000000000..c50ef7082d --- /dev/null +++ serde/src/test/org/apache/hadoop/hive/serde2/teradata/TestTeradataBinarySerdeGeneral.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.teradata; + +import com.google.common.io.BaseEncoding; +import junit.framework.TestCase; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DateWritableV2; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritableV2; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.junit.Assert; + +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +/** + * Test all the data types supported for Teradata Binary Format. + */ +public class TestTeradataBinarySerdeGeneral extends TestCase { + + private final TeradataBinarySerde serde = new TeradataBinarySerde(); + private final Properties props = new Properties(); + + protected void setUp() throws Exception { + props.setProperty(serdeConstants.LIST_COLUMNS, + "TD_CHAR, TD_VARCHAR, TD_BIGINT, TD_INT, TD_SMALLINT, TD_BYTEINT, " + + "TD_FLOAT,TD_DECIMAL,TD_DATE, TD_TIMESTAMP, TD_VARBYTE"); + props.setProperty(serdeConstants.LIST_COLUMN_TYPES, + "char(3),varchar(100),bigint,int,smallint,tinyint,double,decimal(31,30),date,timestamp,binary"); + + serde.initialize(null, props); + } + + public void testDeserializeAndSerialize() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode( + "00004e6f762020202020201b006120646179203d2031312f31312f31312020202020202020203435ec10000000000000c5feffff" + + "7707010000000000002a40ef2b3dab0d14e6531c8908a72700000007b20100313931312d31312d31312031393a32303a32312e34" + + "33333230301b00746573743a20202020202020343333322020202020202020333135")); + + List row = (List) serde.deserialize(in); + Assert.assertEquals("Nov", ((HiveCharWritable) row.get(0)).toString()); + Assert.assertEquals("a day = 11/11/11 45", ((HiveVarcharWritable) row.get(1)).toString()); + Assert.assertEquals(4332L, ((LongWritable) row.get(2)).get()); + Assert.assertEquals(-315, ((IntWritable) row.get(3)).get()); + Assert.assertEquals((short) 1911, ((ShortWritable) row.get(4)).get()); + Assert.assertEquals((byte) 1, ((ByteWritable) row.get(5)).get()); + Assert.assertEquals((double) 13, ((DoubleWritable) row.get(6)).get(), 0); + Assert.assertEquals(30, ((HiveDecimalWritable) row.get(7)).getScale()); + Assert.assertEquals((double) 3.141592653589793238462643383279, + ((HiveDecimalWritable) row.get(7)).getHiveDecimal().doubleValue(), 0); + 
Assert.assertEquals("1911-11-11", ((DateWritableV2) row.get(8)).toString()); + Assert.assertEquals("1911-11-11 19:20:21.4332", ((TimestampWritableV2) row.get(9)).toString()); + Assert.assertEquals(27, ((BytesWritable) row.get(10)).getLength()); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testDeserializeAndSerializeWithNull() throws Exception { + //null bitmap: 0160 -> 00000001 01100000, 7th, 9th, 10th is null + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode( + "01604d61722020202020201b006120646179203d2031332f30332f303820202020202020202020397ca10000000000004300000" + + "0dd0700000000000048834000000000000000000000000000000000443f110020202020202020202020202020202020202020202" + + "020202020200000")); + List row = (List) serde.deserialize(in); + + Assert.assertEquals("Mar", ((HiveCharWritable) row.get(0)).toString()); + Assert.assertEquals(null, row.get(7)); + Assert.assertEquals(null, row.get(9)); + Assert.assertEquals(null, row.get(10)); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testDeserializeAndSerializeAllNull() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode( + "ffe0202020202020202020000000000000000000000000000000000000000000000000000000000000000000000000000000000" + + "00000000020202020202020202020202020202020202020202020202020200000")); + List row = (List) serde.deserialize(in); + + Assert.assertEquals(null, row.get(0)); + Assert.assertEquals(null, row.get(1)); + Assert.assertEquals(null, row.get(3)); + Assert.assertEquals(null, row.get(4)); + Assert.assertEquals(null, row.get(5)); + Assert.assertEquals(null, row.get(6)); + Assert.assertEquals(null, row.get(7)); + Assert.assertEquals(null, row.get(8)); + Assert.assertEquals(null, row.get(9)); + Assert.assertEquals(null, row.get(10)); + + BytesWritable res = (BytesWritable) serde.serialize(row, serde.getObjectInspector()); + Assert.assertTrue(Arrays.equals(in.copyBytes(), res.copyBytes())); + } + + public void testDeserializeCorruptedRecord() throws Exception { + BytesWritable in = new BytesWritable(BaseEncoding.base16().lowerCase().decode( + "00004e6f762020202020201b006120646179203d2031312f31312f31312020202020202020203435ec10000000000000c5feff" + + "ff7707010000000000002a40ef2b3dab0d14e6531c8908a72700000007b20100313931312d31312d31312031393a32303a32312" + + "e3433333230301b00746573743a20202020202020343333322020202020202020333135ff")); + + List row = (List) serde.deserialize(in); + Assert.assertEquals(null, row.get(0)); + Assert.assertEquals(null, row.get(3)); + Assert.assertEquals(null, row.get(10)); + } +}