Details
Type: Bug
Status: Resolved
Priority: Minor
Resolution: Fixed
spark-2.1
Description
Data gets garbled after datamap creation when the table is created with BATCH_SORT/GLOBAL_SORT.
Steps to reproduce:
spark.sql("drop table if exists uniqdata_batchsort_compact3")
spark.sql("CREATE TABLE uniqdata_batchsort_compact3 (CUST_ID int,CUST_NAME String,ACTIVE_EMUI_VERSION string, DOB timestamp, DOJ timestamp, BIGINT_COLUMN1 bigint,BIGINT_COLUMN2 bigint,DECIMAL_COLUMN1 decimal(30,10), DECIMAL_COLUMN2 decimal(36,10),Double_COLUMN1 double, Double_COLUMN2 double,INTEGER_COLUMN1 int) STORED BY 'carbondata' TBLPROPERTIES('SORT_SCOPE'='GLOBAL_SORT')").show()
spark.sql("LOAD DATA INPATH '/home/sangeeta/Desktop/2000_UniqData.csv' into table " +
"uniqdata_batchsort_compact3 OPTIONS('DELIMITER'=',' , 'QUOTECHAR'='\"'," +
"'BAD_RECORDS_ACTION'='FORCE','FILEHEADER'='CUST_ID,CUST_NAME,ACTIVE_EMUI_VERSION," +
"DOB,DOJ,BIGINT_COLUMN1,BIGINT_COLUMN2,DECIMAL_COLUMN1,DECIMAL_COLUMN2," +
"Double_COLUMN1,Double_COLUMN2,INTEGER_COLUMN1','batch_sort_size_inmb'='1')")
spark.sql("LOAD DATA INPATH '/home/sangeeta/Desktop/2000_UniqData.csv' into table " +
"uniqdata_batchsort_compact3 OPTIONS('DELIMITER'=',' , 'QUOTECHAR'='\"'," +
"'BAD_RECORDS_ACTION'='FORCE','FILEHEADER'='CUST_ID,CUST_NAME,ACTIVE_EMUI_VERSION," +
"DOB,DOJ,BIGINT_COLUMN1,BIGINT_COLUMN2,DECIMAL_COLUMN1,DECIMAL_COLUMN2," +
"Double_COLUMN1,Double_COLUMN2,INTEGER_COLUMN1','batch_sort_size_inmb'='1')")
spark.sql("LOAD DATA INPATH '/home/sangeeta/Desktop/2000_UniqData.csv' into table " +
"uniqdata_batchsort_compact3 OPTIONS('DELIMITER'=',' , 'QUOTECHAR'='\"'," +
"'BAD_RECORDS_ACTION'='FORCE','FILEHEADER'='CUST_ID,CUST_NAME,ACTIVE_EMUI_VERSION," +
"DOB,DOJ,BIGINT_COLUMN1,BIGINT_COLUMN2,DECIMAL_COLUMN1,DECIMAL_COLUMN2," +
"Double_COLUMN1,Double_COLUMN2,INTEGER_COLUMN1','batch_sort_size_inmb'='1')")
spark.sql("select cust_id, avg(cust_id) from uniqdata_batchsort_compact3 group by cust_id ").show(50)
--------+--------------+
cust_id | avg(cust_id) |
--------+--------------+
9376 | 9376.0 |
9427 | 9427.0 |
9465 | 9465.0 |
9852 | 9852.0 |
9900 | 9900.0 |
10206 | 10206.0 |
10362 | 10362.0 |
10623 | 10623.0 |
10817 | 10817.0 |
9182 | 9182.0 |
9564 | 9564.0 |
9879 | 9879.0 |
10081 | 10081.0 |
10121 | 10121.0 |
10230 | 10230.0 |
10462 | 10462.0 |
10703 | 10703.0 |
10914 | 10914.0 |
9162 | 9162.0 |
9383 | 9383.0 |
9454 | 9454.0 |
9517 | 9517.0 |
9558 | 9558.0 |
10708 | 10708.0 |
10798 | 10798.0 |
10862 | 10862.0 |
9071 | 9071.0 |
9169 | 9169.0 |
9946 | 9946.0 |
10468 | 10468.0 |
10745 | 10745.0 |
10768 | 10768.0 |
9153 | 9153.0 |
9206 | 9206.0 |
9403 | 9403.0 |
9597 | 9597.0 |
9647 | 9647.0 |
9775 | 9775.0 |
10032 | 10032.0 |
10395 | 10395.0 |
10527 | 10527.0 |
10567 | 10567.0 |
10632 | 10632.0 |
10788 | 10788.0 |
10815 | 10815.0 |
10840 | 10840.0 |
9181 | 9181.0 |
9344 | 9344.0 |
9575 | 9575.0 |
9675 | 9675.0 |
--------+--------------+
only showing top 50 rows
Note: Here the cust_id values are correct.
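For reference (a sketch that is not part of the original report; the value set name correctIds is illustrative), the correct values can be captured at this point so they can be compared with the output after the datamap is created:
// Baseline sketch (illustrative): record the distinct cust_id values returned
// before the datamap exists, for comparison after it is created.
val correctIds = spark.sql(
    "select cust_id, avg(cust_id) from uniqdata_batchsort_compact3 group by cust_id")
  .collect()
  .filter(!_.isNullAt(0))
  .map(_.getAs[Number](0).intValue())
  .toSet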
spark.sql("create datamap uniqdata_agg on table uniqdata_batchsort_compact3 using " +
"'preaggregate' as select avg(cust_id) from uniqdata_batchsort_compact3 group by cust_id")
spark.sql("select cust_id, avg(cust_id) from uniqdata_batchsort_compact3 group by cust_id ").show(50)
--------+--------------+
cust_id | avg(cust_id) |
--------+--------------+
27651 | 9217.0 |
31944 | 10648.0 |
32667 | 10889.0 |
28242 | 9414.0 |
29841 | 9947.0 |
28728 | 9576.0 |
27255 | 9085.0 |
32571 | 10857.0 |
30276 | 10092.0 |
27276 | 9092.0 |
31503 | 10501.0 |
27687 | 9229.0 |
27183 | 9061.0 |
29334 | 9778.0 |
29913 | 9971.0 |
28683 | 9561.0 |
31545 | 10515.0 |
30405 | 10135.0 |
27693 | 9231.0 |
29649 | 9883.0 |
30537 | 10179.0 |
32709 | 10903.0 |
29586 | 9862.0 |
32895 | 10965.0 |
32415 | 10805.0 |
31644 | 10548.0 |
30030 | 10010.0 |
31713 | 10571.0 |
28083 | 9361.0 |
27813 | 9271.0 |
27171 | 9057.0 |
27189 | 9063.0 |
30444 | 10148.0 |
28623 | 9541.0 |
28566 | 9522.0 |
32655 | 10885.0 |
31164 | 10388.0 |
30321 | 10107.0 |
31452 | 10484.0 |
29829 | 9943.0 |
27468 | 9156.0 |
31212 | 10404.0 |
32154 | 10718.0 |
27531 | 9177.0 |
27654 | 9218.0 |
27105 | 9035.0 |
31113 | 10371.0 |
28479 | 9493.0 |
29094 | 9698.0 |
31551 | 10517.0 |
--------+--------------+
only showing top 50 rows
Note: But after datamap creation, cust_id is incorrect: it comes back as three times (equal to the number of loads) its original value, while avg(cust_id) is still correct.
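A minimal verification sketch (not part of the original report; correctIds refers to the baseline captured above) of the observed pattern, e.g. 27651 = 3 * 9217, followed by the usual cleanup step:
// Verification sketch (illustrative): every cust_id reported after datamap
// creation is 3x (the number of loads) a correct value, while the avg column
// still carries the correct value, e.g. 27651 = 3 * 9217.
val rows = spark.sql(
    "select cust_id, avg(cust_id) from uniqdata_batchsort_compact3 group by cust_id")
  .collect()
  .filter(!_.isNullAt(0))
assert(rows.forall { r =>
  val id  = r.getAs[Number](0).intValue()
  val avg = r.getAs[Number](1).doubleValue()
  id % 3 == 0 && correctIds.contains(id / 3) && avg == (id / 3).toDouble
})
// Dropping the pre-aggregate datamap should stop the query from being rewritten
// against it; DROP DATAMAP ... ON TABLE is standard CarbonData syntax.
spark.sql("drop datamap if exists uniqdata_agg on table uniqdata_batchsort_compact3")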