Index: serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableStringAscSerDe.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableStringAscSerDe.java (revision 0) +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableStringAscSerDe.java (revision 0) @@ -0,0 +1,173 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.binarysortable; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.Constants; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * BinarySortableStringAscSerDe writes data in a form that can be compared byte by byte + * while preserving the original sort order. It requires the data to consist of a single + * column of string type, sorted in ascending order. + * + * The data format: NULL is written as a zero-length byte array; an empty string as a + * single 0 byte; Text whose first byte is 0 as a 0 byte followed by the bytes of the + * Text; any other Text as its bytes unchanged. This preserves the sort order, avoids an + * extra memory copy unless the first byte is 0, and keeps the output short. 
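To make that encoding rule concrete, here is a minimal standalone sketch (not part of the patch) of encoding and decoding a single ascending string key so that unsigned byte-by-byte comparison of the encoded arrays matches the intended order NULL < "" < all other strings. The class and method names (StringAscEncodingSketch, encodeStringAsc, decodeStringAsc) are illustrative only.

import java.util.Arrays;

public class StringAscEncodingSketch {
  // NULL -> zero-length array; "" -> {0}; bytes starting with 0 -> a 0 byte prepended; else bytes as-is.
  static byte[] encodeStringAsc(byte[] utf8) {
    if (utf8 == null) {
      return new byte[0];
    }
    if (utf8.length == 0 || utf8[0] == 0) {
      byte[] out = new byte[utf8.length + 1];
      out[0] = 0;
      System.arraycopy(utf8, 0, out, 1, utf8.length);
      return out;
    }
    return utf8;
  }

  static byte[] decodeStringAsc(byte[] encoded) {
    if (encoded.length == 0) {
      return null;                                            // NULL
    }
    if (encoded[0] == 0) {
      return Arrays.copyOfRange(encoded, 1, encoded.length);  // strip the marker byte
    }
    return encoded;
  }

  // Unsigned lexicographic comparison, i.e. the order a byte-comparing shuffle key uses.
  static int compareUnsigned(byte[] a, byte[] b) {
    int n = Math.min(a.length, b.length);
    for (int i = 0; i < n; i++) {
      int d = (a[i] & 0xff) - (b[i] & 0xff);
      if (d != 0) {
        return d;
      }
    }
    return a.length - b.length;
  }

  public static void main(String[] args) {
    byte[] nul = encodeStringAsc(null);
    byte[] empty = encodeStringAsc(new byte[0]);
    byte[] abc = encodeStringAsc("abc".getBytes());
    // NULL sorts before the empty string, which sorts before any non-empty string.
    System.out.println(compareUnsigned(nul, empty) < 0 && compareUnsigned(empty, abc) < 0);
  }
}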
+ * + */ +public class BinarySortableStringAscSerDe implements SerDe { + + public static final Log LOG = LogFactory.getLog(BinarySortableStringAscSerDe.class + .getName()); + + List columnNames; + List columnTypes; + + TypeInfo rowTypeInfo; + StructObjectInspector rowObjectInspector; + + boolean[] columnSortOrderIsDesc; + + @Override + public void initialize(Configuration conf, Properties tbl) + throws SerDeException { + + byte[] nullBuffer = new byte[0]; + nullBytesWritable.set(nullBuffer, 0, 0); + outBuffer[0] = (byte) 0; + + // Get column names and sort order + String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS); + String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES); + + assert (columnNames.size() == 1); + assert (columnNames.size() == columnTypes.size()); + + columnNames = new ArrayList(1); + columnNames.add(columnNameProperty); + + columnTypes = TypeInfoUtils + .getTypeInfosFromTypeString(columnTypeProperty); + + + // Create row related objects + rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + rowObjectInspector = (StructObjectInspector) TypeInfoUtils + .getStandardWritableObjectInspectorFromTypeInfo(rowTypeInfo); + row = new ArrayList(1); + row.add(null); + } + + @Override + public Class getSerializedClass() { + return BytesWritable.class; + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return rowObjectInspector; + } + + ArrayList row; + Text reuseText = new Text(); + + @Override + public Object deserialize(Writable blob) throws SerDeException { + BytesWritable data = (BytesWritable) blob; + + byte[] bytes = data.getBytes(); + int length = data.getLength(); + + // Looks like MapReduce framework added a 0xFF to the end. + + if (data.getLength() == 0) { + row.set(0, null); + } else { + if (bytes[0] == 0) { + reuseText.set(bytes, 1, length - 1); + } else { + reuseText.set(bytes, 0, length); + } + row.set(0, reuseText); + } + return row; + } + + byte[] outBuffer = new byte[128]; + BytesWritable serializeBytesWritable = new BytesWritable(); + BytesWritable nullBytesWritable = new BytesWritable(); + + @Override + public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + StructObjectInspector soi = (StructObjectInspector) objInspector; + List fields = soi.getAllStructFieldRefs(); + + assert (fields.size() == 1); + + Object data = soi.getStructFieldData(obj, fields.get(0)); + + int keyLength; + if (data == null) { + return nullBytesWritable; + } else { + ObjectInspector oi = fields.get(0).getFieldObjectInspector(); + assert(oi instanceof StringObjectInspector); + StringObjectInspector toi = (StringObjectInspector) oi; + Text key = toi.getPrimitiveWritableObject(data); + + keyLength = key.getLength(); + byte[] keyBytes = key.getBytes(); + if (keyLength == 0 || key.getBytes()[0] == (byte) 0) { + if (outBuffer.length < keyLength + 1) { + int newBufferSize = outBuffer.length * 2; + while (newBufferSize < key.getLength() + 1) { + newBufferSize *= 2; + } + outBuffer = new byte[newBufferSize]; + } + outBuffer[0] = 0; + System.arraycopy(key.getBytes(), 0, outBuffer, 1, keyLength); + serializeBytesWritable.set(outBuffer, 0, keyLength + 1); + } else { + serializeBytesWritable.set(keyBytes, 0, keyLength); + } + } + + return serializeBytesWritable; + } +} Index: serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableLongAscSerDe.java =================================================================== --- 
serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableLongAscSerDe.java (revision 0) +++ serde/src/java/org/apache/hadoop/hive/serde2/binarysortable/BinarySortableLongAscSerDe.java (revision 0) @@ -0,0 +1,162 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.binarysortable; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde.Constants; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; + +/** + * BinarySortableLongAscSerDe writes data in a form that can be compared byte by byte + * while preserving the original sort order. It requires the data to consist of a single + * column of bigint (long) type, sorted in ascending order. + * + * The data format: NULL is written as a zero-length byte array; any other value as the + * 8 big-endian bytes of the long with the sign bit flipped. 
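Flipping the sign bit makes the big-endian two's-complement bytes sort the same way as the signed long values under unsigned byte comparison (all negatives land below all non-negatives). A minimal standalone sketch of that encoding, separate from the patch and with illustrative names only:

public class LongAscEncodingSketch {
  // Big-endian bytes of the value with the sign bit flipped in the most significant byte.
  static byte[] encodeLongAsc(long v) {
    byte[] out = new byte[8];
    out[0] = (byte) ((v >> 56) ^ 0x80);     // flip the sign bit
    for (int i = 1; i < 8; i++) {
      out[i] = (byte) (v >> (56 - 8 * i));
    }
    return out;
  }

  static long decodeLongAsc(byte[] bytes) {
    long v = (bytes[0] ^ 0x80) & 0xff;      // undo the sign-bit flip, keep only the low 8 bits
    for (int i = 1; i < 8; i++) {
      v = (v << 8) | (bytes[i] & 0xff);
    }
    return v;
  }

  // Unsigned byte-by-byte comparison of two 8-byte encodings.
  static int compareUnsigned(byte[] a, byte[] b) {
    for (int i = 0; i < 8; i++) {
      int d = (a[i] & 0xff) - (b[i] & 0xff);
      if (d != 0) {
        return d;
      }
    }
    return 0;
  }

  public static void main(String[] args) {
    long[] samples = {Long.MIN_VALUE, -8000000000L, -1L, 0L, 42L, 7000000000L, Long.MAX_VALUE};
    for (int i = 1; i < samples.length; i++) {
      boolean ordered = compareUnsigned(encodeLongAsc(samples[i - 1]), encodeLongAsc(samples[i])) < 0;
      boolean roundTrip = decodeLongAsc(encodeLongAsc(samples[i])) == samples[i];
      System.out.println(ordered + " " + roundTrip);   // expect "true true" for every pair
    }
  }
}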
+ * + */ +public class BinarySortableLongAscSerDe implements SerDe { + + public static final Log LOG = LogFactory.getLog(BinarySortableLongAscSerDe.class + .getName()); + + List columnNames; + List columnTypes; + + TypeInfo rowTypeInfo; + StructObjectInspector rowObjectInspector; + + boolean[] columnSortOrderIsDesc; + + @Override + public void initialize(Configuration conf, Properties tbl) + throws SerDeException { + + byte[] nullBuffer = new byte[0]; + nullBytesWritable.set(nullBuffer, 0, 0); + outBuffer[0] = (byte) 0; + + // Get column names and sort order + String columnNameProperty = tbl.getProperty(Constants.LIST_COLUMNS); + String columnTypeProperty = tbl.getProperty(Constants.LIST_COLUMN_TYPES); + + assert (columnNames.size() == 1); + assert (columnNames.size() == columnTypes.size()); + + columnNames = new ArrayList(1); + columnNames.add(columnNameProperty); + + columnTypes = TypeInfoUtils + .getTypeInfosFromTypeString(columnTypeProperty); + + + // Create row related objects + rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes); + rowObjectInspector = (StructObjectInspector) TypeInfoUtils + .getStandardWritableObjectInspectorFromTypeInfo(rowTypeInfo); + row = new ArrayList(1); + row.add(null); + } + + @Override + public Class getSerializedClass() { + return BytesWritable.class; + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return rowObjectInspector; + } + + ArrayList row; + LongWritable reuseLong = new LongWritable(); + + @Override + public Object deserialize(Writable blob) throws SerDeException { + BytesWritable data = (BytesWritable) blob; + + byte[] bytes = data.getBytes(); + int length = data.getLength(); + + if (length == 0) { + row.set(0, null); + } else { + long v = bytes[0] ^ 0x80; + for (int i = 1; i < 8; i++) { + v = (v << 8) + (bytes[i] & 0xff); + } + reuseLong.set(v); + row.set(0, reuseLong); + } + return row; + } + + byte[] outBuffer = new byte[8]; + BytesWritable serializeBytesWritable = new BytesWritable(); + BytesWritable nullBytesWritable = new BytesWritable(); + + @Override + public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + StructObjectInspector soi = (StructObjectInspector) objInspector; + List fields = soi.getAllStructFieldRefs(); + + assert (fields.size() == 1); + + Object data = soi.getStructFieldData(obj, fields.get(0)); + + if (data == null) { + return nullBytesWritable; + } else { + ObjectInspector oi = fields.get(0).getFieldObjectInspector(); + assert(oi instanceof LongObjectInspector); + LongObjectInspector loi = (LongObjectInspector) oi; + long v = loi.get(data); + + outBuffer[0] = (byte) ((v >> 56) ^ 0x80); + outBuffer[1] = (byte) (v >> 48); + outBuffer[2] = (byte) (v >> 40); + outBuffer[3] = (byte) (v >> 32); + outBuffer[4] = (byte) (v >> 24); + outBuffer[5] = (byte) (v >> 16); + outBuffer[6] = (byte) (v >> 8); + outBuffer[7] = (byte) v; + + serializeBytesWritable.set(outBuffer, 0, 8); + } + + return serializeBytesWritable; + } +} Index: ql/src/test/results/clientpositive/join_bigint.q.out =================================================================== --- ql/src/test/results/clientpositive/join_bigint.q.out (revision 0) +++ ql/src/test/results/clientpositive/join_bigint.q.out (revision 0) @@ -0,0 +1,374 @@ +PREHOOK: query: drop table join_long_1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table join_long_1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: CREATE TABLE join_long_1(key1 bigint, key2 bigint, value string) +PREHOOK: type: 
CREATETABLE +POSTHOOK: query: CREATE TABLE join_long_1(key1 bigint, key2 bigint, value string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@join_long_1 +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/in7.txt' INTO TABLE join_long_1 +PREHOOK: type: LOAD +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/in7.txt' INTO TABLE join_long_1 +POSTHOOK: type: LOAD +POSTHOOK: Output: default@join_long_1 +PREHOOK: query: SELECT key1, count(key1), count(1), avg(key1), sum(key2) from join_long_1 group by key1 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-44_080_6687839933448529793/-mr-10000 +POSTHOOK: query: SELECT key1, count(key1), count(1), avg(key1), sum(key2) from join_long_1 group by key1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-44_080_6687839933448529793/-mr-10000 +NULL 0 3 NULL 100 +-8000000000 2 2 -8.0E9 8000000040 +-7000000000 2 2 -7.0E9 8000000040 +10 1 1 10.0 10 +30 1 1 30.0 30 +40 2 2 40.0 80 +50 3 3 50.0 150 +60 2 2 60.0 80 +80 2 2 80.0 80 +7000000000 2 2 7.0E9 80 +PREHOOK: query: SELECT key1, key2, count(1) from join_long_1 group by key1, key2 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-47_084_5970249462607701546/-mr-10000 +POSTHOOK: query: SELECT key1, key2, count(1) from join_long_1 group by key1, key2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-47_084_5970249462607701546/-mr-10000 +NULL NULL 1 +NULL 50 2 +-8000000000 40 1 +-8000000000 8000000000 1 +-7000000000 40 1 +-7000000000 8000000000 1 +10 10 1 +30 30 1 +40 40 2 +50 50 3 +60 40 2 +80 40 2 +7000000000 40 2 +PREHOOK: query: SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key1 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-49_704_2423888710872826499/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-49_704_2423888710872826499/-mr-10000 +-8000000000 40 88 -8000000000 40 88 +-8000000000 40 88 -8000000000 8000000000 88 +-8000000000 8000000000 88 -8000000000 40 88 +-8000000000 8000000000 88 -8000000000 8000000000 88 +-7000000000 40 88 -7000000000 40 88 +-7000000000 40 88 -7000000000 8000000000 88 +-7000000000 8000000000 88 -7000000000 40 88 +-7000000000 8000000000 88 -7000000000 8000000000 88 +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 88 40 40 88 +40 40 88 40 40 66 +40 40 66 40 40 88 +40 40 66 40 40 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +50 50 88 50 50 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +PREHOOK: query: SELECT * FROM join_long_1 a full outer join join_long_1 b on a.key1 = b.key1 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-52_404_2538499380194485415/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a full outer join join_long_1 b on a.key1 = 
b.key1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-52_404_2538499380194485415/-mr-10000 +NULL NULL 66 NULL NULL NULL +NULL 50 66 NULL NULL NULL +NULL 50 66 NULL NULL NULL +NULL NULL NULL NULL NULL 66 +NULL NULL NULL NULL 50 66 +NULL NULL NULL NULL 50 66 +-8000000000 40 88 -8000000000 40 88 +-8000000000 40 88 -8000000000 8000000000 88 +-8000000000 8000000000 88 -8000000000 40 88 +-8000000000 8000000000 88 -8000000000 8000000000 88 +-7000000000 40 88 -7000000000 40 88 +-7000000000 40 88 -7000000000 8000000000 88 +-7000000000 8000000000 88 -7000000000 40 88 +-7000000000 8000000000 88 -7000000000 8000000000 88 +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 88 40 40 88 +40 40 88 40 40 66 +40 40 66 40 40 88 +40 40 66 40 40 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +50 50 88 50 50 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +PREHOOK: query: SELECT * FROM join_long_1 a left outer join join_long_1 b on a.key1 = b.key1 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-55_959_736506743560899851/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a left outer join join_long_1 b on a.key1 = b.key1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-55_959_736506743560899851/-mr-10000 +NULL NULL 66 NULL NULL NULL +NULL 50 66 NULL NULL NULL +NULL 50 66 NULL NULL NULL +-8000000000 40 88 -8000000000 40 88 +-8000000000 40 88 -8000000000 8000000000 88 +-8000000000 8000000000 88 -8000000000 40 88 +-8000000000 8000000000 88 -8000000000 8000000000 88 +-7000000000 40 88 -7000000000 40 88 +-7000000000 40 88 -7000000000 8000000000 88 +-7000000000 8000000000 88 -7000000000 40 88 +-7000000000 8000000000 88 -7000000000 8000000000 88 +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 88 40 40 88 +40 40 88 40 40 66 +40 40 66 40 40 88 +40 40 66 40 40 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +50 50 88 50 50 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +PREHOOK: query: SELECT * FROM join_long_1 a right outer join join_long_1 b on a.key1 = b.key1 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-58_608_5841980130753701529/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a right outer join join_long_1 b on a.key1 = b.key1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-19-58_608_5841980130753701529/-mr-10000 +NULL NULL NULL NULL NULL 66 +NULL NULL NULL NULL 50 66 +NULL NULL NULL NULL 50 66 +-8000000000 40 88 -8000000000 40 88 +-8000000000 40 88 -8000000000 8000000000 88 +-8000000000 8000000000 88 -8000000000 40 88 +-8000000000 8000000000 88 -8000000000 8000000000 88 +-7000000000 
40 88 -7000000000 40 88 +-7000000000 40 88 -7000000000 8000000000 88 +-7000000000 8000000000 88 -7000000000 40 88 +-7000000000 8000000000 88 -7000000000 8000000000 88 +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 88 40 40 88 +40 40 88 40 40 66 +40 40 66 40 40 88 +40 40 66 40 40 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +50 50 88 50 50 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +60 40 66 60 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +80 40 66 80 40 66 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +7000000000 40 88 7000000000 40 88 +PREHOOK: query: SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key2 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-20-01_182_260018714794157578/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-20-01_182_260018714794157578/-mr-10000 +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 66 40 40 88 +40 40 66 40 40 66 +40 40 66 60 40 66 +40 40 66 -7000000000 40 88 +40 40 66 -8000000000 40 88 +40 40 66 7000000000 40 88 +40 40 66 7000000000 40 88 +40 40 66 80 40 66 +40 40 66 80 40 66 +40 40 66 60 40 66 +40 40 88 40 40 88 +40 40 88 40 40 66 +40 40 88 60 40 66 +40 40 88 -7000000000 40 88 +40 40 88 -8000000000 40 88 +40 40 88 7000000000 40 88 +40 40 88 7000000000 40 88 +40 40 88 80 40 66 +40 40 88 80 40 66 +40 40 88 60 40 66 +50 50 66 NULL 50 66 +50 50 66 50 50 66 +50 50 66 NULL 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 88 NULL 50 66 +50 50 88 50 50 66 +50 50 88 NULL 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +50 50 66 NULL 50 66 +50 50 66 50 50 66 +50 50 66 NULL 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +PREHOOK: query: SELECT * FROM join_long_1 a full outer join join_long_1 b on a.key1 = b.key2 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-20-10_361_8104781671953740368/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a full outer join join_long_1 b on a.key1 = b.key2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-20-10_361_8104781671953740368/-mr-10000 +NULL NULL 66 NULL NULL NULL +NULL 50 66 NULL NULL NULL +NULL 50 66 NULL NULL NULL +NULL NULL NULL NULL NULL 66 +-8000000000 40 88 NULL NULL NULL +-8000000000 8000000000 88 NULL NULL NULL +-7000000000 8000000000 88 NULL NULL NULL +-7000000000 40 88 NULL NULL NULL +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 66 40 40 88 +40 40 66 40 40 66 +40 40 66 60 40 66 +40 40 66 -7000000000 40 88 +40 40 66 -8000000000 40 88 +40 40 66 7000000000 40 88 +40 40 66 7000000000 40 88 +40 40 66 80 40 66 +40 40 66 80 40 66 +40 40 66 60 40 66 +40 40 88 40 40 88 +40 40 88 40 40 66 +40 40 88 60 40 66 +40 40 88 -7000000000 40 88 +40 40 88 -8000000000 40 88 +40 40 88 7000000000 40 88 +40 40 88 7000000000 40 88 +40 40 88 80 40 66 +40 40 88 80 40 66 +40 40 88 60 40 66 +50 50 66 NULL 50 66 +50 50 66 50 50 66 +50 50 66 NULL 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 88 NULL 50 66 +50 50 88 50 50 66 +50 50 88 NULL 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +50 50 66 NULL 50 66 +50 50 66 50 50 66 +50 50 66 NULL 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +60 40 66 
NULL NULL NULL +60 40 66 NULL NULL NULL +80 40 66 NULL NULL NULL +80 40 66 NULL NULL NULL +7000000000 40 88 NULL NULL NULL +7000000000 40 88 NULL NULL NULL +NULL NULL NULL -8000000000 8000000000 88 +NULL NULL NULL -7000000000 8000000000 88 +PREHOOK: query: SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key2 and b.key1 = b.key2 +PREHOOK: type: QUERY +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-20-13_000_3073460877272633086/-mr-10000 +POSTHOOK: query: SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key2 and b.key1 = b.key2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: file:/tmp/sdong/hive_2010-11-19_12-20-13_000_3073460877272633086/-mr-10000 +10 10 66 10 10 66 +30 30 88 30 30 88 +40 40 66 40 40 88 +40 40 66 40 40 66 +40 40 88 40 40 88 +40 40 88 40 40 66 +50 50 66 50 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 66 50 50 66 +50 50 66 50 50 88 +50 50 66 50 50 66 +50 50 88 50 50 66 +50 50 88 50 50 88 +50 50 88 50 50 66 +PREHOOK: query: drop table join_long_1 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@join_long_1 +PREHOOK: Output: default@join_long_1 +POSTHOOK: query: drop table join_long_1 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@join_long_1 +POSTHOOK: Output: default@join_long_1 Index: ql/src/test/queries/clientpositive/join_bigint.q =================================================================== --- ql/src/test/queries/clientpositive/join_bigint.q (revision 0) +++ ql/src/test/queries/clientpositive/join_bigint.q (revision 0) @@ -0,0 +1,20 @@ +drop table join_long_1; + +CREATE TABLE join_long_1(key1 bigint, key2 bigint, value string); +LOAD DATA LOCAL INPATH '../data/files/in7.txt' INTO TABLE join_long_1; + +SELECT key1, count(key1), count(1), avg(key1), sum(key2) from join_long_1 group by key1; +SELECT key1, key2, count(1) from join_long_1 group by key1, key2; + + +SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key1; +SELECT * FROM join_long_1 a full outer join join_long_1 b on a.key1 = b.key1; +SELECT * FROM join_long_1 a left outer join join_long_1 b on a.key1 = b.key1; +SELECT * FROM join_long_1 a right outer join join_long_1 b on a.key1 = b.key1; + +SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key2; +SELECT * FROM join_long_1 a full outer join join_long_1 b on a.key1 = b.key2; + +SELECT * FROM join_long_1 a join join_long_1 b on a.key1 = b.key2 and b.key1 = b.key2; + +drop table join_long_1; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (revision 10359) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ReduceSinkOperator.java (working copy) @@ -76,7 +76,7 @@ transient Serializer keySerializer; transient boolean keyIsText; transient Serializer valueSerializer; - transient int tag; + transient int encodeTag; transient byte[] tagByte = new byte[1]; transient protected int numDistributionKeys; transient protected int numDistinctExprs; @@ -107,9 +107,9 @@ partitionEval[i++] = ExprNodeEvaluatorFactory.get(e); } - tag = conf.getTag(); - tagByte[0] = (byte) tag; - LOG.info("Using tag = " + tag); + encodeTag = conf.getTag(); + tagByte[0] = (byte) encodeTag; + LOG.info("Using tag = " + encodeTag); TableDesc keyTableDesc = conf.getKeySerializeInfo(); keySerializer = (Serializer) keyTableDesc.getDeserializerClass() @@ -263,7 +263,7 @@ if (keyIsText) { Text key = (Text) 
keySerializer.serialize(cachedKeys[i], keyObjectInspector); - if (tag == -1) { + if (this.encodeTag == -1) { keyWritable.set(key.getBytes(), 0, key.getLength()); } else { int keyLength = key.getLength(); @@ -275,7 +275,7 @@ // Must be BytesWritable BytesWritable key = (BytesWritable) keySerializer.serialize( cachedKeys[i], keyObjectInspector); - if (tag == -1) { + if (this.encodeTag == -1) { keyWritable.set(key.getBytes(), 0, key.getLength()); } else { int keyLength = key.getLength(); Index: ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (revision 10359) +++ ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java (working copy) @@ -50,7 +50,9 @@ import org.apache.hadoop.hive.serde2.DelimitedJSONSerDe; import org.apache.hadoop.hive.serde2.Deserializer; import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableLongAscSerDe; import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe; +import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableStringAscSerDe; import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; @@ -77,6 +79,10 @@ FIELD, JEXL }; + public static enum SortableSerdeType { + GENERAL, SINGLE_ASC_STRING, SINGLE_ASC_LONG + } + @SuppressWarnings("nls") public static MapredWork getMapRedWork() { try { @@ -294,15 +300,39 @@ /** * Generate the table descriptor for reduce key. */ +// public static TableDesc getReduceKeyTableDesc(List fieldSchemas, +// String order) { +// return getReduceKeyTableDesc(fieldSchemas, order, SortableSerdeType.GENERAL); +// } + + /** + * Generate the table descriptor for reduce key. 
+ * @throws SemanticException + */ public static TableDesc getReduceKeyTableDesc(List fieldSchemas, - String order) { - return new TableDesc(BinarySortableSerDe.class, - SequenceFileInputFormat.class, SequenceFileOutputFormat.class, - Utilities.makeProperties(Constants.LIST_COLUMNS, MetaStoreUtils + String order, SortableSerdeType serdeType) { + Properties properties = Utilities.makeProperties(Constants.LIST_COLUMNS, MetaStoreUtils .getColumnNamesFromFieldSchema(fieldSchemas), Constants.LIST_COLUMN_TYPES, MetaStoreUtils .getColumnTypesFromFieldSchema(fieldSchemas), - Constants.SERIALIZATION_SORT_ORDER, order)); + Constants.SERIALIZATION_SORT_ORDER, order); + switch (serdeType) { + case GENERAL: + return new TableDesc(BinarySortableSerDe.class, + SequenceFileInputFormat.class, SequenceFileOutputFormat.class, + properties); + case SINGLE_ASC_STRING: + return new TableDesc(BinarySortableStringAscSerDe.class, + SequenceFileInputFormat.class, SequenceFileOutputFormat.class, + properties); + case SINGLE_ASC_LONG: + return new TableDesc(BinarySortableLongAscSerDe.class, + SequenceFileInputFormat.class, SequenceFileOutputFormat.class, + properties); + default: + assert (false); + return null; + } } /** @@ -531,14 +561,25 @@ TableDesc valueTable = null; ArrayList outputKeyCols = new ArrayList(); ArrayList outputValCols = new ArrayList(); + + SortableSerdeType serdeType; + if (keyCols.size() == 1 && keyCols.get(0).getTypeInfo().equals(TypeInfoFactory.stringTypeInfo) + && distinctColIndices.size() == 0 && (order.isEmpty() || order.charAt(0) == '+')) { + serdeType = SortableSerdeType.SINGLE_ASC_STRING; + } else if (keyCols.size() == 1 && keyCols.get(0).getTypeInfo().equals(TypeInfoFactory.longTypeInfo) + && distinctColIndices.size() == 0 && (order.isEmpty() || order.charAt(0) == '+')){ + serdeType = SortableSerdeType.SINGLE_ASC_LONG; + } else { + serdeType = SortableSerdeType.GENERAL; + } if (includeKeyCols) { keyTable = getReduceKeyTableDesc(getFieldSchemasFromColumnListWithLength( keyCols, distinctColIndices, outputKeyColumnNames, numKeys, ""), - order); + order, serdeType); outputKeyCols.addAll(outputKeyColumnNames); } else { keyTable = getReduceKeyTableDesc(getFieldSchemasFromColumnList( - keyCols, "reducesinkkey"),order); + keyCols, "reducesinkkey"), order, serdeType); for (int i = 0; i < keyCols.size(); i++) { outputKeyCols.add("reducesinkkey" + i); } Index: ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (revision 10359) +++ ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java (working copy) @@ -82,12 +82,12 @@ import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.DummyPartition; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.metadata.HiveUtils; import org.apache.hadoop.hive.ql.metadata.InvalidTableException; import org.apache.hadoop.hive.ql.metadata.Partition; -import org.apache.hadoop.hive.ql.metadata.DummyPartition; import org.apache.hadoop.hive.ql.metadata.Table; import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1; @@ -145,6 +145,7 @@ import org.apache.hadoop.hive.ql.plan.UDTFDesc; import org.apache.hadoop.hive.ql.plan.UnionDesc; import 
org.apache.hadoop.hive.ql.plan.FilterDesc.sampleDesc; +import org.apache.hadoop.hive.ql.plan.PlanUtils.SortableSerdeType; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.ResourceType; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; @@ -4632,9 +4633,23 @@ Operator oi = (i == 0 && right[i] == null ? left : right[i]); ReduceSinkDesc now = ((ReduceSinkOperator) (oi)).getConf(); + SortableSerdeType serdeType; + ArrayList keyCols = now.getKeyCols(); + String order = now.getOrder(); + + if (keyCols.size() == 1 && keyCols.get(0).getTypeInfo().equals(TypeInfoFactory.stringTypeInfo) + && (order.isEmpty() || order.charAt(0) == '+')) { + serdeType = SortableSerdeType.SINGLE_ASC_STRING; + } else if (keyCols.size() == 1 && keyCols.get(0).getTypeInfo().equals(TypeInfoFactory.longTypeInfo) + && (order.isEmpty() || order.charAt(0) == '+')) { + serdeType = SortableSerdeType.SINGLE_ASC_LONG; + } else { + serdeType = SortableSerdeType.GENERAL; + } + now.setKeySerializeInfo(PlanUtils.getReduceKeyTableDesc(PlanUtils .getFieldSchemasFromColumnList(now.getKeyCols(), "joinkey"), now - .getOrder())); + .getOrder(), serdeType)); } }
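The specialized SerDes are selected by essentially the same check in PlanUtils and in SemanticAnalyzer: exactly one reduce key column, of string or bigint type, with an ascending (or default) sort order, and, on the PlanUtils path, no DISTINCT key expression lists. The following small sketch restates that predicate as a hypothetical helper; it is not part of the patch, and the parameter shapes (type names as strings, a distinct-expression count) are simplifications of the ExprNodeDesc/TypeInfo objects used above.

public class SortableSerdeChoiceSketch {
  enum SortableSerdeType { GENERAL, SINGLE_ASC_STRING, SINGLE_ASC_LONG }

  // keyTypes: Hive type names of the reduce key columns; numDistinctExprs: number of
  // DISTINCT key expression lists; order: one '+'/'-' per key column ("" means default ascending).
  static SortableSerdeType choose(String[] keyTypes, int numDistinctExprs, String order) {
    boolean singleAscKey = keyTypes.length == 1
        && numDistinctExprs == 0
        && (order.isEmpty() || order.charAt(0) == '+');
    if (singleAscKey && "string".equals(keyTypes[0])) {
      return SortableSerdeType.SINGLE_ASC_STRING;
    }
    if (singleAscKey && "bigint".equals(keyTypes[0])) {
      return SortableSerdeType.SINGLE_ASC_LONG;
    }
    return SortableSerdeType.GENERAL;
  }

  public static void main(String[] args) {
    System.out.println(choose(new String[] {"bigint"}, 0, "+"));            // SINGLE_ASC_LONG
    System.out.println(choose(new String[] {"string", "string"}, 0, "++")); // GENERAL: more than one key
    System.out.println(choose(new String[] {"string"}, 0, "-"));            // GENERAL: descending order
  }
}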