diff --git a/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
index c0c6964..03378a9 100644
--- a/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
+++ b/orc/src/java/org/apache/orc/impl/ConvertTreeReaderFactory.java
@@ -983,8 +983,24 @@ public void nextVector(ColumnVector previousVector,
     public void nextVector(ColumnVector previousVector,
                            boolean[] isNull,
                            final int batchSize) throws IOException {
-      // The DoubleColumnVector produced by FloatTreeReader is what we want.
+      // Let the float tree reader fill the DoubleColumnVector first, then push every
+      // non-null entry through a double -> float -> string -> double round trip.
+      // Assigning a float to a double (Java widening conversion) yields extra digits
+      // that make equality comparisons against double literals fail.
+      // Example: float f = 74.72
+      //          double d = f ---> 74.72000122070312
+      //          Double.parseDouble(String.valueOf(f)) ---> 74.72
       floatTreeReader.nextVector(previousVector, isNull, batchSize);
+
+      final DoubleColumnVector result = (DoubleColumnVector) previousVector;
+      final double[] values = result.vector;
+      // A repeating vector is rewritten at slot 0 only, independent of batchSize.
+      final int limit = result.isRepeating ? 1 : batchSize;
+      for (int row = 0; row < limit; row++) {
+        if (result.noNulls || !result.isNull[row]) {
+          values[row] = Double.parseDouble(String.valueOf((float) values[row]));
+        }
+      }
     }
   }
 
diff --git a/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java 
b/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java
index 3cd0390..a9c64fa 100644
--- a/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java
+++ b/orc/src/test/org/apache/orc/impl/TestSchemaEvolution.java
@@ -17,16 +17,48 @@
  */
 package org.apache.orc.impl;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
+import java.io.File;
 import java.io.IOException;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.RecordReader;
 import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;
 
 public class TestSchemaEvolution {
 
+  @Rule
+  public TestName testCaseName = new TestName();
+
+  Configuration conf;
+  Path testFilePath;
+  FileSystem fs;
+  Path workDir = new Path(System.getProperty("test.tmp.dir",
+      "target" + File.separator + "test" + File.separator + "tmp"));
+
+  @Before
+  public void setup() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    testFilePath = new Path(workDir, "TestOrcFile." +
+        testCaseName.getMethodName() + ".orc");
+    fs.delete(testFilePath, false);
+  }
+
   @Test
   public void testDataTypeConversion1() throws IOException {
     TypeDescription fileStruct1 = TypeDescription.createStruct()
@@ -104,4 +136,40 @@ public void testDataTypeConversion2() throws IOException {
     SchemaEvolution both2diffChar = new SchemaEvolution(fileStruct2, readerStruct2diffChar, null);
     assertTrue(both2diffChar.hasConversion());
   }
+
+  /**
+   * Writes a single float value and reads it back through a double read schema,
+   * expecting the decimal value 74.72 rather than the widened 74.72000122070312.
+   */
+  @Test
+  public void testFloatToDoubleEvolution() throws Exception {
+    // testFilePath was already derived from the test name (and cleared) by setup().
+    TypeDescription schema = TypeDescription.createFloat();
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
+            .bufferSize(10000));
+    VectorizedRowBatch batch = new VectorizedRowBatch(1, 1024);
+    DoubleColumnVector dcv = new DoubleColumnVector(1024);
+    batch.cols[0] = dcv;
+    batch.reset();
+    batch.size = 1;
+    dcv.vector[0] = 74.72f;
+    writer.addRowBatch(batch);
+    writer.close();
+
+    Reader reader = OrcFile.createReader(testFilePath,
+        OrcFile.readerOptions(conf).filesystem(fs));
+    TypeDescription schemaOnRead = TypeDescription.createDouble();
+    RecordReader rows = reader.rows(new Reader.Options().schema(schemaOnRead));
+    try {
+      batch = schemaOnRead.createRowBatch();
+      // Fail loudly if no row comes back instead of asserting on an unfilled vector.
+      assertTrue(rows.nextBatch(batch));
+      assertEquals(1, batch.size);
+      assertEquals(74.72, ((DoubleColumnVector) batch.cols[0]).vector[0], 0.00000000001);
+    } finally {
+      // Release the reader even when an assertion above fails.
+      rows.close();
+    }
+  }
 }
diff --git a/ql/src/test/queries/clientpositive/orc_schema_evolution_float.q b/ql/src/test/queries/clientpositive/orc_schema_evolution_float.q
new file mode 100644
index 0000000..6316324
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/orc_schema_evolution_float.q
@@ -0,0 +1,38 @@
+set hive.optimize.index.filter=false;
+set hive.metastore.disallow.incompatible.col.type.changes=false;
+
+drop table float_text;
+create table float_text(f float);
+insert into float_text values(74.72);
+insert into float_text values(0.22);
+select f from float_text;
+alter table float_text change column f f double;
+select f from float_text;
+select f from float_text where f=74.72;
+select f from float_text where f=0.22;
+alter table float_text change column f f decimal(14,5);
+select f from float_text;
+select f from float_text where f=74.72;
+select f from float_text where f=0.22;
+
+create table float_orc(f float) stored as orc;
+insert overwrite table float_orc select * from float_text;
+select f from float_orc;
+alter table float_orc change column f f double;
+select f from 
float_orc; +select f from float_orc where f=74.72; +select f from float_orc where f=0.22; +set hive.optimize.index.filter=true; +select f from float_orc where f=74.72; +select f from float_orc where f=0.22; + +alter table float_orc change column f f decimal(14,5); +select f from float_orc; +select f from float_orc where f=74.72; +select f from float_orc where f=0.22; +set hive.optimize.index.filter=true; +select f from float_orc where f=74.72; +select f from float_orc where f=0.22; + +drop table float_text; +drop table float_orc; diff --git a/ql/src/test/results/clientpositive/orc_schema_evolution_float.q.out b/ql/src/test/results/clientpositive/orc_schema_evolution_float.q.out new file mode 100644 index 0000000..2654c4b --- /dev/null +++ b/ql/src/test/results/clientpositive/orc_schema_evolution_float.q.out @@ -0,0 +1,263 @@ +PREHOOK: query: drop table float_text +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table float_text +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table float_text(f float) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@float_text +POSTHOOK: query: create table float_text(f float) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@float_text +PREHOOK: query: insert into float_text values(74.72) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@float_text +POSTHOOK: query: insert into float_text values(74.72) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@float_text +POSTHOOK: Lineage: float_text.f EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into float_text values(0.22) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__2 +PREHOOK: Output: default@float_text +POSTHOOK: query: insert into float_text values(0.22) +POSTHOOK: type: QUERY +POSTHOOK: 
Input: default@values__tmp__table__2 +POSTHOOK: Output: default@float_text +POSTHOOK: Lineage: float_text.f EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: select f from float_text +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +74.72 +0.22 +PREHOOK: query: alter table float_text change column f f double +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@float_text +PREHOOK: Output: default@float_text +POSTHOOK: query: alter table float_text change column f f double +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@float_text +POSTHOOK: Output: default@float_text +PREHOOK: query: select f from float_text +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +74.72 +0.22 +PREHOOK: query: select f from float_text where f=74.72 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text where f=74.72 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +74.72 +PREHOOK: query: select f from float_text where f=0.22 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text where f=0.22 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +0.22 +PREHOOK: query: alter table float_text change column f f decimal(14,5) +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@float_text +PREHOOK: Output: default@float_text +POSTHOOK: query: alter table 
float_text change column f f decimal(14,5) +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@float_text +POSTHOOK: Output: default@float_text +PREHOOK: query: select f from float_text +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +74.72000 +0.22000 +PREHOOK: query: select f from float_text where f=74.72 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text where f=74.72 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +74.72000 +PREHOOK: query: select f from float_text where f=0.22 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +#### A masked pattern was here #### +POSTHOOK: query: select f from float_text where f=0.22 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +#### A masked pattern was here #### +0.22000 +PREHOOK: query: create table float_orc(f float) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@float_orc +POSTHOOK: query: create table float_orc(f float) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@float_orc +PREHOOK: query: insert overwrite table float_orc select * from float_text +PREHOOK: type: QUERY +PREHOOK: Input: default@float_text +PREHOOK: Output: default@float_orc +POSTHOOK: query: insert overwrite table float_orc select * from float_text +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_text +POSTHOOK: Output: default@float_orc +POSTHOOK: Lineage: float_orc.f EXPRESSION [(float_text)float_text.FieldSchema(name:f, type:decimal(14,5), comment:null), ] +PREHOOK: query: select f from float_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was 
here #### +POSTHOOK: query: select f from float_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72 +0.22 +PREHOOK: query: alter table float_orc change column f f double +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@float_orc +PREHOOK: Output: default@float_orc +POSTHOOK: query: alter table float_orc change column f f double +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@float_orc +POSTHOOK: Output: default@float_orc +PREHOOK: query: select f from float_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72 +0.22 +PREHOOK: query: select f from float_orc where f=74.72 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=74.72 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72 +PREHOOK: query: select f from float_orc where f=0.22 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=0.22 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +0.22 +PREHOOK: query: select f from float_orc where f=74.72 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=74.72 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72 +PREHOOK: query: select f from float_orc where f=0.22 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=0.22 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked 
pattern was here #### +0.22 +PREHOOK: query: alter table float_orc change column f f decimal(14,5) +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@float_orc +PREHOOK: Output: default@float_orc +POSTHOOK: query: alter table float_orc change column f f decimal(14,5) +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@float_orc +POSTHOOK: Output: default@float_orc +PREHOOK: query: select f from float_orc +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72000 +0.22000 +PREHOOK: query: select f from float_orc where f=74.72 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=74.72 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72000 +PREHOOK: query: select f from float_orc where f=0.22 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=0.22 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +0.22000 +PREHOOK: query: select f from float_orc where f=74.72 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=74.72 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +74.72000 +PREHOOK: query: select f from float_orc where f=0.22 +PREHOOK: type: QUERY +PREHOOK: Input: default@float_orc +#### A masked pattern was here #### +POSTHOOK: query: select f from float_orc where f=0.22 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@float_orc +#### A masked pattern was here #### +0.22000 +PREHOOK: query: drop table float_text +PREHOOK: type: DROPTABLE +PREHOOK: 
Input: default@float_text +PREHOOK: Output: default@float_text +POSTHOOK: query: drop table float_text +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@float_text +POSTHOOK: Output: default@float_text +PREHOOK: query: drop table float_orc +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@float_orc +PREHOOK: Output: default@float_orc +POSTHOOK: query: drop table float_orc +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@float_orc +POSTHOOK: Output: default@float_orc