Index: serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (revision 9970) +++ serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarSerDe.java (working copy) @@ -94,7 +94,7 @@ java.util.ArrayList notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(job); - cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs); + cachedLazyStruct = new ColumnarStruct(cachedObjectInspector, notSkipIDs, serdeParams.getNullSequence()); int size = serdeParams.getColumnTypes().size(); field = new BytesRefWritable[size]; Index: serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java (revision 9970) +++ serde/src/java/org/apache/hadoop/hive/serde2/columnar/ColumnarStruct.java (working copy) @@ -38,50 +38,50 @@ * Object get parsed at its initialize time when call * {@link #init(BytesRefArrayWritable cols)}, while LazyStruct parse fields in a * lazy way. - * + * */ public class ColumnarStruct { - /** - * The fields of the struct. - */ - LazyObject[] fields; - private static final Log LOG = LogFactory.getLog(ColumnarStruct.class); int[] prjColIDs = null; // list of projected column IDs + Text nullSequence; + int lengthNullSequence; + /** * Construct a ColumnarStruct object with the TypeInfo. It creates the first * level object at the first place - * + * * @param oi * the ObjectInspector representing the type of this LazyStruct. */ public ColumnarStruct(ObjectInspector oi) { - this(oi, null); + this(oi, null, null); } /** * Construct a ColumnarStruct object with the TypeInfo. It creates the first * level object at the first place - * + * * @param oi * the ObjectInspector representing the type of this LazyStruct. 
* @param notSkippedColumnIDs * the column ids that should not be skipped */ public ColumnarStruct(ObjectInspector oi, - ArrayList notSkippedColumnIDs) { + ArrayList notSkippedColumnIDs, Text nullSequence) { List fieldRefs = ((StructObjectInspector) oi) .getAllStructFieldRefs(); int num = fieldRefs.size(); - fields = new LazyObject[num]; - cachedByteArrayRef = new ByteArrayRef[num]; - rawBytesField = new BytesRefWritable[num]; - fieldSkipped = new boolean[num]; - inited = new boolean[num]; + fieldInfoList = new FieldInfo[num]; + + if (nullSequence != null) { + this.nullSequence = nullSequence; + this.lengthNullSequence = nullSequence.getLength(); + } + // if no columns is set to be skipped, add all columns in // 'notSkippedColumnIDs' if (notSkippedColumnIDs == null || notSkippedColumnIDs.size() == 0) { @@ -91,15 +91,10 @@ } for (int i = 0; i < num; i++) { - fields[i] = LazyFactory.createLazyObject(fieldRefs.get(i) - .getFieldObjectInspector()); - cachedByteArrayRef[i] = new ByteArrayRef(); - if (!notSkippedColumnIDs.contains(i)) { - fieldSkipped[i] = true; - inited[i] = true; - } else { - inited[i] = false; - } + fieldInfoList[i] = new FieldInfo( + LazyFactory.createLazyObject(fieldRefs.get(i) + .getFieldObjectInspector()), + !notSkippedColumnIDs.contains(i)); } // maintain a list of non-NULL column IDs @@ -117,73 +112,110 @@ /** * Get one field out of the struct. - * + * * If the field is a primitive field, return the actual object. Otherwise * return the LazyObject. This is because PrimitiveObjectInspector does not * have control over the object used by the user - the user simply directly * use the Object instead of going through Object * PrimitiveObjectInspector.get(Object). - * + * * NOTE: separator and nullSequence has to be the same each time this method * is called. These two parameters are used only once to parse each record. 
- * + * @param fieldID * The field ID * @param nullSequence * The sequence for null value * @return The field as a LazyObject */ - public Object getField(int fieldID, Text nullSequence) { - return uncheckedGetField(fieldID, nullSequence); + public Object getField(int fieldID) { + return fieldInfoList[fieldID].uncheckedGetField(); } - /* - * use an array instead of only one object in case in future hive does not do - * the byte copy. - */ - ByteArrayRef[] cachedByteArrayRef = null; - BytesRefWritable[] rawBytesField = null; - boolean[] inited = null; - boolean[] fieldSkipped = null; + class FieldInfo { + LazyObject field; + /* + * keep one ByteArrayRef per field instead of only one shared object in case in + * future hive does not do the byte copy. + */ + ByteArrayRef cachedByteArrayRef; + BytesRefWritable rawBytesField; + boolean inited; + boolean fieldSkipped; - /** - * Get the field out of the row without checking parsed. This is called by - * both getField and getFieldsAsList. - * - * @param fieldID - * The id of the field starting from 0. - * @param nullSequence - * The sequence representing NULL value. 
- * @return The value of the field - */ - protected Object uncheckedGetField(int fieldID, Text nullSequence) { - if (fieldSkipped[fieldID]) { - return null; - } - if (!inited[fieldID]) { - BytesRefWritable passedInField = rawBytesField[fieldID]; - try { - cachedByteArrayRef[fieldID].setData(passedInField.getData()); - } catch (IOException e) { - throw new RuntimeException(e); + public FieldInfo(LazyObject lazyObject, boolean fieldSkipped) { + field = lazyObject; + cachedByteArrayRef = new ByteArrayRef(); + if (fieldSkipped) { + this.fieldSkipped = true; + inited = true; + } else { + inited = false; } - fields[fieldID].init(cachedByteArrayRef[fieldID], passedInField - .getStart(), passedInField.getLength()); - inited[fieldID] = true; } - byte[] data = cachedByteArrayRef[fieldID].getData(); - int fieldLen = rawBytesField[fieldID].length; + /* + * ============================ [PERF] =================================== + * This function is called for every row. Setting up the selected/projected + * columns at the first call, and don't do that for the following calls. + * Ideally this should be done in the constructor where we don't need to + * branch in the function for each row. + * ========================================================================= + */ + public void init(BytesRefWritable col) { + if (col != null) { + rawBytesField= col; + inited = false; + } else { + // select columns that actually do not exist in the file. + fieldSkipped = true; + } + } - if (fieldLen == nullSequence.getLength() - && LazyUtils.compare(data, rawBytesField[fieldID].getStart(), fieldLen, - nullSequence.getBytes(), 0, nullSequence.getLength()) == 0) { - return null; + /** + * Get the field out of the row without checking parsed. This is called by + * both getField and getFieldsAsList. + * + * Takes no arguments: the field is this FieldInfo itself, and the NULL + * sequence is the one stored by the enclosing ColumnarStruct. Yields null + * for a skipped field or for raw bytes equal to that sequence. + *
+ * @return The value of the field + */ + protected Object uncheckedGetField() { + if (fieldSkipped) { + return null; + } + if (!inited) { + try { + cachedByteArrayRef.setData(rawBytesField.getData()); + } catch (IOException e) { + throw new RuntimeException(e); + } + field.init(cachedByteArrayRef, rawBytesField + .getStart(), rawBytesField.getLength()); + inited = true; + } + + + int fieldLen = rawBytesField.length; + if (fieldLen == lengthNullSequence) { + byte[] data = cachedByteArrayRef.getData(); + + if (LazyUtils.compare(data, rawBytesField.getStart(), fieldLen, + nullSequence.getBytes(), 0, lengthNullSequence) == 0) { + return null; + } + } + + return field.getObject(); + } - - return fields[fieldID].getObject(); } + FieldInfo[] fieldInfoList = null; + + /* * ============================ [PERF] =================================== * This function is called for every row. Setting up the selected/projected @@ -196,11 +228,10 @@ for (int i = 0; i < prjColIDs.length; ++i) { int fieldIndex = prjColIDs[i]; if (fieldIndex < cols.size()) { - rawBytesField[fieldIndex] = cols.unCheckedGet(fieldIndex); - inited[fieldIndex] = false; + fieldInfoList[fieldIndex].init(cols.unCheckedGet(fieldIndex)); } else { // select columns that actually do not exist in the file. - fieldSkipped[fieldIndex] = true; + fieldInfoList[fieldIndex].init(null); } } } @@ -209,19 +240,19 @@ /** * Get the values of the fields as an ArrayList. - * + * * @param nullSequence * The sequence for the NULL value * @return The values of the fields as an ArrayList. 
*/ - public ArrayList getFieldsAsList(Text nullSequence) { + public ArrayList getFieldsAsList() { if (cachedList == null) { cachedList = new ArrayList(); } else { cachedList.clear(); } - for (int i = 0; i < fields.length; i++) { - cachedList.add(uncheckedGetField(i, nullSequence)); + for (int i = 0; i < fieldInfoList.length; i++) { + cachedList.add(fieldInfoList[i].uncheckedGetField()); } return cachedList; } Index: serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java =================================================================== --- serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java (revision 9970) +++ serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ColumnarStructObjectInspector.java (working copy) @@ -29,10 +29,10 @@ /** * ColumnarStructObjectInspector works on struct data that is stored in * ColumnarStruct. - * + * * The names of the struct fields and the internal structure of the struct * fields are specified in the ctor of the ColumnarStructObjectInspector. - * + * * Always use the ObjectInspectorFactory to create new ObjectInspector objects, * instead of directly creating an instance of this class. 
*/ @@ -144,7 +144,7 @@ int fieldID = f.getFieldID(); assert (fieldID >= 0 && fieldID < fields.size()); - return struct.getField(fieldID, nullSequence); + return struct.getField(fieldID); } @Override @@ -153,6 +153,6 @@ return null; } ColumnarStruct struct = (ColumnarStruct) data; - return struct.getFieldsAsList(nullSequence); + return struct.getFieldsAsList(); } } Index: ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java (revision 9970) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/ExprNodeColumnEvaluator.java (working copy) @@ -33,6 +33,9 @@ protected ExprNodeColumnDesc expr; + transient boolean simpleCase; + transient StructObjectInspector inspector; + transient StructField field; transient StructObjectInspector[] inspectors; transient StructField[] fields; transient boolean[] unionField; @@ -47,40 +50,53 @@ // We need to support field names like KEY.0, VALUE.1 between // map-reduce boundary. 
String[] names = expr.getColumn().split("\\."); - inspectors = new StructObjectInspector[names.length]; - fields = new StructField[names.length]; - unionField = new boolean[names.length]; - int unionIndex = -1; + String[] unionfields = names[0].split("\\:"); + if (names.length == 1 && unionfields.length == 1) { + simpleCase = true; + inspector = (StructObjectInspector) rowInspector; + field = inspector.getStructFieldRef(names[0]); + return field.getFieldObjectInspector(); + } + else { + simpleCase = false; + inspectors = new StructObjectInspector[names.length]; + fields = new StructField[names.length]; + unionField = new boolean[names.length]; + int unionIndex = -1; - for (int i = 0; i < names.length; i++) { - if (i == 0) { - inspectors[0] = (StructObjectInspector) rowInspector; - } else { - if (unionIndex != -1) { - inspectors[i] = (StructObjectInspector) ( - (UnionObjectInspector)fields[i-1].getFieldObjectInspector()). - getObjectInspectors().get(unionIndex); + for (int i = 0; i < names.length; i++) { + if (i == 0) { + inspectors[0] = (StructObjectInspector) rowInspector; } else { - inspectors[i] = (StructObjectInspector) fields[i - 1] - .getFieldObjectInspector(); + if (unionIndex != -1) { + inspectors[i] = (StructObjectInspector) ( + (UnionObjectInspector)fields[i-1].getFieldObjectInspector()). 
+ getObjectInspectors().get(unionIndex); + } else { + inspectors[i] = (StructObjectInspector) fields[i - 1] + .getFieldObjectInspector(); + } } + // to support names like _colx:1._coly + unionfields = names[i].split("\\:"); + fields[i] = inspectors[i].getStructFieldRef(unionfields[0]); + if (unionfields.length > 1) { + unionIndex = Integer.parseInt(unionfields[1]); + unionField[i] = true; + } else { + unionIndex = -1; + unionField[i] = false; + } } - // to support names like _colx:1._coly - String[] unionfields = names[i].split("\\:"); - fields[i] = inspectors[i].getStructFieldRef(unionfields[0]); - if (unionfields.length > 1) { - unionIndex = Integer.parseInt(unionfields[1]); - unionField[i] = true; - } else { - unionIndex = -1; - unionField[i] = false; - } + return fields[names.length - 1].getFieldObjectInspector(); } - return fields[names.length - 1].getFieldObjectInspector(); } @Override public Object evaluate(Object row) throws HiveException { + if (simpleCase) { + return inspector.getStructFieldData(row, field); + } Object o = row; for (int i = 0; i < fields.length; i++) { o = inspectors[i].getStructFieldData(o, fields[i]);