Uploaded image for project: 'IMPALA'
  1. IMPALA
  2. IMPALA-4302

In-predicate filters do not scale as expected with # of elements constant IN-list

    Details

    • Docs Text:
      Performance of in() expressions with many constant arguments was improved significantly. E.g. a list of 100 items could see a 10x performance improvement. This changes also improved performance of other functions with many constant arguments.
    • Target Version:

      Description

      ExprContext::GetValue is called per item in a constant IN-list which is not needed, this makes in-list predicates >10x slower than an equivalent filter through RuntimeFilter BloomFilter and query run time grows linearly with number of items in the constant IN-list.

      The code below is responsible for the slowdown, getting the value of each item in the list per row is unnecessary

      void ScalarFnCall::EvaluateChildren(ExprContext* context, const TupleRow* row,
                                          vector<AnyVal*>* input_vals) {
        DCHECK_EQ(input_vals->size(), NumFixedArgs());
        FunctionContext* fn_ctx = context->fn_context(fn_context_index_);
        uint8_t* varargs_buffer = fn_ctx->impl()->varargs_buffer();
        for (int i = 0; i < children_.size(); ++i) {
          void* src_slot = context->GetValue(children_[i], row);
          AnyVal* dst_val;
          if (vararg_start_idx_ == -1 || i < vararg_start_idx_) {
            dst_val = (*input_vals)[i];
          } else {
            dst_val = reinterpret_cast<AnyVal*>(varargs_buffer);
            varargs_buffer += AnyValUtil::AnyValSize(children_[i]->type());
          }
          AnyValUtil::SetAnyVal(src_slot, children_[i]->type(), dst_val);
        }
      }
      

      Repro

      create table in_list_keys stored as parquet as  select distinct c_custkey from customer where c_custkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501,506,511,516,521,526,531,536,541,546,551,556,561,566,571,576,581,586,591,596,601,606,611,616,621,626,631,636,641,646,651,656,661,666,671,676,681,686,691,696,701,706,711,716,721,726,731,736,741,746,751,756,761,766,771,776,781,786,791,796,801,806,811,816,821,826,831,836,841,846,851,856,861,866,871,876,881,886,891,896,901,906,911,916,921,926,931,936,941,946,951,956,961,966,971,976,981,986,991,996,1001,1006,1011,1016,1021,1026,1031,1036,1041,1046,1051,1056,1061,1066,1071,1076,1081,1086,1091,1096,1101,1106,1111,1116,1121,1126,1131,1136,1141,1146,1151,1156,1161,1166,1171,1176,1181,1186,1191,1196,1201,1206,1211,1216,1221,1226,1231,1236,1241,1246,1251,1256,1261,1266,1271,1276,1281,1286,1291,1296,1301,1306,1311,1316,1321,1326,1331,1336,1341,1346,1351,1356,1361,1366,1371,1376,1381,1386,1391,1396,1401,1406,1411,1416,1421,1426,1431,1436,1441,1446,1451,1456,1461,1466,1471,1476,1481,1486,1491,1496,1501,1506,1511,1516,1521,1526,1531,1536,1541,1546,1551,1556,1561,1566,1571,1576,1581,1586,1591,1596,1601,1606,1611,1616,1621,1626,1631,1636,1641,1646,1651,1656,1661,1666,1671,1676,1681,1686,1691,1696,1701,1706,1711,1716,1721,1726,1731,1736,1741,1746,1751,1756,1761,1766,1771,1776,1781,1786,1791,1796,1801,1806,1811,1816,1821,1826,1831,1836,1841,1846,1851,1856,1861,1866,1871,1876,1881,1886,1891,1896,1901,1906,1911,1916,1921,1926,1931,1936,1941,1946,1951,1956,1961,1966,1971,1976,1981,1986,1991,1996,2001,2006,2011,2016,2021,2026,2031,2036,2041,2046,2051,2056,2061,2066,2071,2076,2081,2086,2091,2096,2101,2106,2111,2116,2121,2126,2131,2136,2141,2146,2151,2156,2161,2166,2171,2176,2181,2186,2191,2196,2201,2206,2211,2216,2221,2226,2231,2236,2241,2246,2251,2256,2261,2266,2271,2276,2281,2286);
      compute stats in_list_keys;
      

      Slow in-list query which finished in 23 seconds for 288 Million rows of Lineitem

      select count(*) from lineitem where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501,506,511,516,521,526,531,536,541,546,551,556,561,566,571,576,581,586,591,596,601,606,611,616,621,626,631,636,641,646,651,656,661,666,671,676,681,686,691,696,701,706,711,716,721,726,731,736,741,746,751,756,761,766,771,776,781,786,791,796,801,806,811,816,821,826,831,836,841,846,851,856,861,866,871,876,881,886,891,896,901,906,911,916,921,926,931,936,941,946,951,956,961,966,971,976,981,986,991,996,1001,1006,1011,1016,1021,1026,1031,1036,1041,1046,1051,1056,1061,1066,1071,1076,1081,1086,1091,1096,1101,1106,1111,1116,1121,1126,1131,1136,1141,1146,1151,1156,1161,1166,1171,1176,1181,1186,1191,1196,1201,1206,1211,1216,1221,1226,1231,1236,1241,1246,1251,1256,1261,1266,1271,1276,1281,1286,1291,1296,1301,1306,1311,1316,1321,1326,1331,1336,1341,1346,1351,1356,1361,1366,1371,1376,1381,1386,1391,1396,1401,1406,1411,1416,1421,1426,1431,1436,1441,1446,1451,1456,1461,1466,1471,1476,1481,1486,1491,1496,1501,1506,1511,1516,1521,1526,1531,1536,1541,1546,1551,1556,1561,1566,1571,1576,1581,1586,1591,1596,1601,1606,1611,1616,1621,1626,1631,1636,1641,1646,1651,1656,1661,1666,1671,1676,1681,1686,1691,1696,1701,1706,1711,1716,1721,1726,1731,1736,1741,1746,1751,1756,1761,1766,1771,1776,1781,1786,1791,1796,1801,1806,1811,1816,1821,1826,1831,1836,1841,1846,1851,1856,1861,1866,1871,1876,1881,1886,1891,1896,1901,1906,1911,1916,1921,1926,1931,1936,1941,1946,1951,1956,1961,1966,1971,1976,1981,1986,1991,1996,2001,2006,2011,2016,2021,2026,2031,2036,2041,2046,2051,2056,2061,2066,2071,2076,2081,2086,2091,2096,2101,2106,2111,2116,2121,2126,2131,2136,2141,2146,2151,2156,2161,2166,2171,2176,2181,2186,2191,2196,2201,2206,2211,2216,2221,2226,2231,2236,2241,2246,2251,2256,2261,2266,2271,2276,2281,2286);
      

      Much faster query which uses BloomFilter scan operator takes 2s769ms.

      select count(*) from lineitem_large ,in_list_keys ik where l_partkey=c_custkey;
      

      Profile snippet for scan with runtime filters

              HDFS_SCAN_NODE (id=0):(Total: 2s769ms, non-child: 2s769ms, % non-child: 100.00%)
                ExecOption: Expr Evaluation Codegen Disabled, PARQUET Codegen Enabled, Codegen enabled: 44 out of 44
                Hdfs split stats (<volume id>:<# splits>/<split lengths>): 0:44/9.95 GB 
                Runtime filters: All filters arrived. Waited 0
                Hdfs Read Thread Concurrency Bucket: 0:87.5% 1:12.5% 2:0% 3:0% 4:0% 
                File Formats: PARQUET/SNAPPY:44 
                BytesRead(500.000ms): 90.83 MB, 260.69 MB, 453.00 MB, 648.54 MB, 834.41 MB, 1.02 GB, 1.18 GB, 1.34 GB
                 - AverageHdfsReadThreadConcurrency: 0.12 
                 - AverageScannerThreadConcurrency: 3.62 
                 - BytesRead: 1.34 GB (1433770420)
                 - BytesReadDataNodeCache: 0
                 - BytesReadLocal: 1.34 GB (1433770420)
                 - BytesReadRemoteUnexpected: 0
                 - BytesReadShortCircuit: 1.34 GB (1433770420)
                 - DecompressionTime: 2s702ms
                 - MaxCompressedTextFileLength: 0
                 - NumColumns: 1 (1)
                 - NumDisksAccessed: 1 (1)
                 - NumRowGroups: 44 (44)
                 - NumScannerThreadsStarted: 4 (4)
                 - PeakMemoryUsage: 163.15 MB (171075512)
                 - PerReadThreadRawHdfsThroughput: 3.02 GB/sec
                 - RemoteScanRanges: 0 (0)
                 - RowBatchQueueGetWaitTime: 2s762ms
                 - RowBatchQueuePutWaitTime: 0.000ns
                 - RowsRead: 288.00M (288001184)
                 - RowsReturned: 109.31K (109312)
                 - RowsReturnedRate: 39.47 K/sec
                 - ScanRangesComplete: 44 (44)
                 - ScannerThreadsInvoluntaryContextSwitches: 906 (906)
                 - ScannerThreadsTotalWallClockTime: 13s869ms
                   - MaterializeTupleTime(*): 10s721ms
                   - ScannerThreadsSysTime: 731.018ms
                   - ScannerThreadsUserTime: 12s779ms
                 - ScannerThreadsVoluntaryContextSwitches: 274 (274)
                 - TotalRawHdfsReadTime(*): 442.479ms
                 - TotalReadThroughput: 341.84 MB/sec
                Filter 0 (1.00 MB):
                   - Rows processed: 288.00M (288001184)
                   - Rows rejected: 287.89M (287891872)
                   - Rows total: 288.00M (288001184)
      

      Profile snippet for scan with slow in-predicate

      HDFS_SCAN_NODE (id=0):(Total: 22s469ms, non-child: 22s469ms, % non-child: 100.00%)
                ExecOption: Expr Evaluation Codegen Enabled, PARQUET Codegen Enabled, Codegen enabled: 44 out of 44
                Hdfs split stats (<volume id>:<# splits>/<split lengths>): 0:44/9.95 GB 
                Hdfs Read Thread Concurrency Bucket: 0:97.83% 1:2.174% 2:0% 3:0% 4:0% 
                File Formats: PARQUET/SNAPPY:44 
                BytesRead(500.000ms): 0, 137.86 MB, 137.86 MB, 137.86 MB, 137.86 MB, 249.11 MB, 249.11 MB, 249.11 MB, 308.89 MB, 376.81 MB, 376.81 MB, 410.85 MB, 444.79 MB, 512.72 MB, 512.72 MB, 546.69 MB, 546.69 MB, 546.69 MB, 648.54 MB, 648.54 MB, 682.52 MB, 682.52 MB, 750.52 MB, 784.45 MB, 805.28 MB, 805.28 MB, 873.21 MB, 929.70 MB, 929.70 MB, 937.80 MB, 989.49 MB, 1.03 GB, 1.03 GB, 1.03 GB, 1.04 GB, 1.17 GB, 1.17 GB, 1.17 GB, 1.17 GB, 1.20 GB, 1.24 GB, 1.27 GB, 1.30 GB, 1.34 GB, 1.34 GB, 1.34 GB, 1.34 GB
                 - AverageHdfsReadThreadConcurrency: 0.02 
                 - AverageScannerThreadConcurrency: 3.93 
                 - BytesRead: 1.34 GB (1433770420)
                 - BytesReadDataNodeCache: 0
                 - BytesReadLocal: 1.34 GB (1433770420)
                 - BytesReadRemoteUnexpected: 0
                 - BytesReadShortCircuit: 1.34 GB (1433770420)
                 - DecompressionTime: 2s549ms
                 - MaxCompressedTextFileLength: 0
                 - NumColumns: 1 (1)
                 - NumDisksAccessed: 1 (1)
                 - NumRowGroups: 44 (44)
                 - NumScannerThreadsStarted: 4 (4)
                 - PeakMemoryUsage: 167.18 MB (175303488)
                 - PerReadThreadRawHdfsThroughput: 570.71 MB/sec
                 - RemoteScanRanges: 0 (0)
                 - RowBatchQueueGetWaitTime: 22s454ms
                 - RowBatchQueuePutWaitTime: 0.000ns
                 - RowsRead: 288.00M (288001184)
                 - RowsReturned: 109.31K (109312)
                 - RowsReturnedRate: 4.86 K/sec
                 - ScanRangesComplete: 44 (44)
                 - ScannerThreadsInvoluntaryContextSwitches: 6.02K (6019)
                 - ScannerThreadsTotalWallClockTime: 1m31s
                   - MaterializeTupleTime(*): 1m27s
                   - ScannerThreadsSysTime: 939.121ms
                   - ScannerThreadsUserTime: 1m29s
                 - ScannerThreadsVoluntaryContextSwitches: 185 (185)
                 - TotalRawHdfsReadTime(*): 2s395ms
                 - TotalReadThroughput: 58.19 MB/sec
      

        Activity

        Hide
        tarmstrong Tim Armstrong added a comment -

        IMPALA-3629 should avoid the interpreted overhead here. Is it still a problem in 2.8?

        Show
        tarmstrong Tim Armstrong added a comment - IMPALA-3629 should avoid the interpreted overhead here. Is it still a problem in 2.8?
        Hide
        mmokhtar Mostafa Mokhtar added a comment -

        Yes, this on 2.8 and the build has IMPALA-3629.

        Show
        mmokhtar Mostafa Mokhtar added a comment - Yes, this on 2.8 and the build has IMPALA-3629 .
        Hide
        mmokhtar Mostafa Mokhtar added a comment -

        It appears that the impala::ExprContext::GetValue is called per item in the in-list per row.

        Query with 450 items in in-list finished in 22 seconds

        select count(*) from lineitem_large where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501,506,511,516,521,526,531,536,541,546,551,556,561,566,571,576,581,586,591,596,601,606,611,616,621,626,631,636,641,646,651,656,661,666,671,676,681,686,691,696,701,706,711,716,721,726,731,736,741,746,751,756,761,766,771,776,781,786,791,796,801,806,811,816,821,826,831,836,841,846,851,856,861,866,871,876,881,886,891,896,901,906,911,916,921,926,931,936,941,946,951,956,961,966,971,976,981,986,991,996,1001,1006,1011,1016,1021,1026,1031,1036,1041,1046,1051,1056,1061,1066,1071,1076,1081,1086,1091,1096,1101,1106,1111,1116,1121,1126,1131,1136,1141,1146,1151,1156,1161,1166,1171,1176,1181,1186,1191,1196,1201,1206,1211,1216,1221,1226,1231,1236,1241,1246,1251,1256,1261,1266,1271,1276,1281,1286,1291,1296,1301,1306,1311,1316,1321,1326,1331,1336,1341,1346,1351,1356,1361,1366,1371,1376,1381,1386,1391,1396,1401,1406,1411,1416,1421,1426,1431,1436,1441,1446,1451,1456,1461,1466,1471,1476,1481,1486,1491,1496,1501,1506,1511,1516,1521,1526,1531,1536,1541,1546,1551,1556,1561,1566,1571,1576,1581,1586,1591,1596,1601,1606,1611,1616,1621,1626,1631,1636,1641,1646,1651,1656,1661,1666,1671,1676,1681,1686,1691,1696,1701,1706,1711,1716,1721,1726,1731,1736,1741,1746,1751,1756,1761,1766,1771,1776,1781,1786,1791,1796,1801,1806,1811,1816,1821,1826,1831,1836,1841,1846,1851,1856,1861,1866,1871,1876,1881,1886,1891,1896,1901,1906,1911,1916,1921,1926,1931,1936,1941,1946,1951,1956,1961,1966,1971,1976,1981,1986,1991,1996,2001,2006,2011,2016,2021,2026,2031,2036,2041,2046,2051,2056,2061,2066,2071,2076,2081,2086,2091,2096,2101,2106,2111,2116,2121,2126,2131,2136,2141,2146,2151,2156,2161,2166,2171,2176,2181,2186,2191,2196,2201,2206,2211,2216,2221,2226,2231,2236,2241,2246,2251,2256,2261,2266,2271,2276,2281,2286)
        Query submitted at: 2016-10-17 13:16:04 (Coordinator: http://mostafa-01:25000)
        Query progress can be monitored at: http://mostafa-01:25000/query_plan?query_id=164b72549d77d112:b176b22400000000
        +----------+
        | count(*) |
        +----------+
        | 109312   |
        +----------+
        Fetched 1 row(s) in 22.64s
        

        Query with 220 items in in-list finished in 13 seconds

        Query: select count(*) from lineitem_large where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501,506,511,516,521,526,531,536,541,546,551,556,561,566,571,576,581,586,591,596,601,606,611,616,621,626,631,636,641,646,651,656,661,666,671,676,681,686,691,696,701,706,711,716,721,726,731,736,741,746,751,756,761,766,771,776,781,786,791,796,801,806,811,816,821,826,831,836,841,846,851,856,861,866,871,876,881,886,891,896,901,906,911,916,921,926,931,936,941,946,951,956,961,966,971,976,981,986,991,996,1001,1006,1011,1016,1021,1026,1031,1036,1041,1046,1051,1056,1061,1066,1071,1076,1081,1086,1091,1096,1101)
        Query submitted at: 2016-10-17 13:16:27 (Coordinator: http://mostafa-01:25000)
        Query progress can be monitored at: http://mostafa-01:25000/query_plan?query_id=c44c3d10794c69ef:aba95b4f00000000
        +----------+
        | count(*) |
        +----------+
        | 52760    |
        +----------+
        Fetched 1 row(s) in 13.10s
        

        Query with 101 items in list finished in 8 seconds

        Query: select count(*) from lineitem_large where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501)
        Query submitted at: 2016-10-17 13:16:40 (Coordinator: http://mostafa-01:25000)
        Query progress can be monitored at: http://mostafa-01:25000/query_plan?query_id=874f3fe0304f35ee:62efcc200000000
        +----------+
        | count(*) |
        +----------+
        | 24416    |
        +----------+
        Fetched 1 row(s) in 7.97s
        
        Show
        mmokhtar Mostafa Mokhtar added a comment - It appears that the impala::ExprContext::GetValue is called per item in the in-list per row. Query with 450 items in in-list finished in 22 seconds select count(*) from lineitem_large where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501,506,511,516,521,526,531,536,541,546,551,556,561,566,571,576,581,586,591,596,601,606,611,616,621,626,631,636,641,646,651,656,661,666,671,676,681,686,691,696,701,706,711,716,721,726,731,736,741,746,751,756,761,766,771,776,781,786,791,796,801,806,811,816,821,826,831,836,841,846,851,856,861,866,871,876,881,886,891,896,901,906,911,916,921,926,931,936,941,946,951,956,961,966,971,976,981,986,991,996,1001,1006,1011,1016,1021,1026,1031,1036,1041,1046,1051,1056,1061,1066,1071,1076,1081,1086,1091,1096,1101,1106,1111,1116,1121,1126,1131,1136,1141,1146,1151,1156,1161,1166,1171,1176,1181,1186,1191,1196,1201,1206,1211,1216,1221,1226,1231,1236,1241,1246,1251,1256,1261,1266,1271,1276,1281,1286,1291,1296,1301,1306,1311,1316,1321,1326,1331,1336,1341,1346,1351,1356,1361,1366,1371,1376,1381,1386,1391,1396,1401,1406,1411,1416,1421,1426,1431,1436,1441,1446,1451,1456,1461,1466,1471,1476,1481,1486,1491,1496,1501,1506,1511,1516,1521,1526,1531,1536,1541,1546,1551,1556,1561,1566,1571,1576,1581,1586,1591,1596,1601,1606,1611,1616,1621,1626,1631,1636,1641,1646,1651,1656,1661,1666,1671,1676,1681,1686,1691,1696,1701,1706,1711,1716,1721,1726,1731,1736,1741,1746,1751,1756,1761,1766,1771,1776,1781,1786,1791,1796,1801,1806,1811,1816,1821,1826,1831,1836,1841,1846,1851,1856,1861,1866,1871,1876,1881,1886,1891,1896,1901,1906,1911,1916,1921,1926,1931,1936,1941,1946,1951,1956,1961,1966,1971,1976,1981,1986,1991,1996,2001,2006,2011,2016,2021,2026,2031,2036,2041,2046,2051,2056,2061,2066,2071,2076,2081,2086,2091,2096,2101,2106,2111,2116,2121,2126,2131,2136,2141,2146,2151,2156,2161,2166,2171,2176,2181,2186,2191,2196,2201,2206,2211,2216,2221,2226,2231,2236,2241,2246,2251,2256,2261,2266,2271,2276,2281,2286) Query submitted at: 2016-10-17 13:16:04 (Coordinator: http: //mostafa-01:25000) Query progress can be monitored at: http: //mostafa-01:25000/query_plan?query_id=164b72549d77d112:b176b22400000000 +----------+ | count(*) | +----------+ | 109312 | +----------+ Fetched 1 row(s) in 22.64s Query with 220 items in in-list finished in 13 seconds Query: select count(*) from lineitem_large where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501,506,511,516,521,526,531,536,541,546,551,556,561,566,571,576,581,586,591,596,601,606,611,616,621,626,631,636,641,646,651,656,661,666,671,676,681,686,691,696,701,706,711,716,721,726,731,736,741,746,751,756,761,766,771,776,781,786,791,796,801,806,811,816,821,826,831,836,841,846,851,856,861,866,871,876,881,886,891,896,901,906,911,916,921,926,931,936,941,946,951,956,961,966,971,976,981,986,991,996,1001,1006,1011,1016,1021,1026,1031,1036,1041,1046,1051,1056,1061,1066,1071,1076,1081,1086,1091,1096,1101) Query submitted at: 2016-10-17 13:16:27 (Coordinator: http: //mostafa-01:25000) Query progress can be monitored at: http: //mostafa-01:25000/query_plan?query_id=c44c3d10794c69ef:aba95b4f00000000 +----------+ | count(*) | +----------+ | 52760 | +----------+ Fetched 1 row(s) in 13.10s Query with 101 items in list finished in 8 seconds Query: select count(*) from lineitem_large where l_partkey in (1,6,11,16,21,26,31,36,41,46,51,56,61,66,71,76,81,86,91,96,101,106,111,116,121,126,131,136,141,146,151,156,161,166,171,176,181,186,191,196,201,206,211,216,221,226,231,236,241,246,251,256,261,266,271,276,281,286,291,296,301,306,311,316,321,326,331,336,341,346,351,356,361,366,371,376,381,386,391,396,401,406,411,416,421,426,431,436,441,446,451,456,461,466,471,476,481,486,491,496,501) Query submitted at: 2016-10-17 13:16:40 (Coordinator: http: //mostafa-01:25000) Query progress can be monitored at: http: //mostafa-01:25000/query_plan?query_id=874f3fe0304f35ee:62efcc200000000 +----------+ | count(*) | +----------+ | 24416 | +----------+ Fetched 1 row(s) in 7.97s
        Hide
        mmokhtar Mostafa Mokhtar added a comment -

        Tim Armstrong Alexander Behm

        Computing the value per item in the in-list is not needed when dealing with constant IN-list, only when going through the iterate code path

        template<typename T, typename SetType, bool not_in, InPredicate::Strategy strategy>
        BooleanVal InPredicate::TemplatedIn(
            FunctionContext* ctx, const T& val, int num_args, const T* args) {
          if (val.is_null) return BooleanVal::null();
        
          BooleanVal found;
          if (strategy == SET_LOOKUP) {
            SetLookupState<SetType>* state = reinterpret_cast<SetLookupState<SetType>*>(
                ctx->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
            DCHECK(state != NULL);
            found = SetLookup(state, val);
          } else {
            DCHECK_EQ(strategy, ITERATE);
            found = Iterate(ctx->GetArgType(0), val, num_args, args);
          }
          if (found.is_null) return BooleanVal::null();
          return BooleanVal(found.val ^ not_in);
        }
        
        Show
        mmokhtar Mostafa Mokhtar added a comment - Tim Armstrong Alexander Behm Computing the value per item in the in-list is not needed when dealing with constant IN-list, only when going through the iterate code path template<typename T, typename SetType, bool not_in, InPredicate::Strategy strategy> BooleanVal InPredicate::TemplatedIn( FunctionContext* ctx, const T& val, int num_args, const T* args) { if (val.is_null) return BooleanVal:: null (); BooleanVal found; if (strategy == SET_LOOKUP) { SetLookupState<SetType>* state = reinterpret_cast<SetLookupState<SetType>*>( ctx->GetFunctionState(FunctionContext::FRAGMENT_LOCAL)); DCHECK(state != NULL); found = SetLookup(state, val); } else { DCHECK_EQ(strategy, ITERATE); found = Iterate(ctx->GetArgType(0), val, num_args, args); } if (found.is_null) return BooleanVal:: null (); return BooleanVal(found.val ^ not_in); }
        Hide
        tarmstrong Tim Armstrong added a comment -

        There's something wonky here, that stuff should be optimised out of the codegen'd code. Probably needs some investigation of what's actually happening.

        Show
        tarmstrong Tim Armstrong added a comment - There's something wonky here, that stuff should be optimised out of the codegen'd code. Probably needs some investigation of what's actually happening.
        Hide
        tarmstrong Tim Armstrong added a comment -

        I took a quick look at the optimised IR and it was pretty obvious what was wrong - we're not dealing with constant varargs very well - we're generated the code to populate the varargs buffer for every row. This works ok for non-varargs since it's pretty straightforward for LLVM to eliminate the deads stores, but in thie case I don't think there's any way for it to know that the stores to the varargs buffer are dead.

        GetSlotRef.exit.i.i:                              ; preds = %get_slot.i.i.i, %50
          %is_null_phi.i.i.i = phi i1 [ false, %50 ], [ true, %get_slot.i.i.i ]
          %val_phi.i.i.i = phi i64 [ 0, %50 ], [ %val.i.i.i, %get_slot.i.i.i ]
          %lowered_arg_val_ptr3.i.i = bitcast i8* %51 to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 1 }, { i8, i64 }* %lowered_arg_val_ptr3.i.i, align 8
          %arg_val_ptr5.i.i = getelementptr i8, i8* %51, i64 16
          %lowered_arg_val_ptr7.i.i = bitcast i8* %arg_val_ptr5.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 6 }, { i8, i64 }* %lowered_arg_val_ptr7.i.i, align 8
          %arg_val_ptr9.i.i = getelementptr i8, i8* %51, i64 32
          %lowered_arg_val_ptr11.i.i = bitcast i8* %arg_val_ptr9.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 11 }, { i8, i64 }* %lowered_arg_val_ptr11.i.i, align 8
          %arg_val_ptr13.i.i = getelementptr i8, i8* %51, i64 48
          %lowered_arg_val_ptr15.i.i = bitcast i8* %arg_val_ptr13.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 16 }, { i8, i64 }* %lowered_arg_val_ptr15.i.i, align 8
          %arg_val_ptr17.i.i = getelementptr i8, i8* %51, i64 64
          %lowered_arg_val_ptr19.i.i = bitcast i8* %arg_val_ptr17.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 21 }, { i8, i64 }* %lowered_arg_val_ptr19.i.i, align 8
          %arg_val_ptr21.i.i = getelementptr i8, i8* %51, i64 80
          %lowered_arg_val_ptr23.i.i = bitcast i8* %arg_val_ptr21.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 26 }, { i8, i64 }* %lowered_arg_val_ptr23.i.i, align 8
          %arg_val_ptr25.i.i = getelementptr i8, i8* %51, i64 96
          %lowered_arg_val_ptr27.i.i = bitcast i8* %arg_val_ptr25.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 31 }, { i8, i64 }* %lowered_arg_val_ptr27.i.i, align 8
          %arg_val_ptr29.i.i = getelementptr i8, i8* %51, i64 112
          %lowered_arg_val_ptr31.i.i = bitcast i8* %arg_val_ptr29.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 36 }, { i8, i64 }* %lowered_arg_val_ptr31.i.i, align 8
          %arg_val_ptr33.i.i = getelementptr i8, i8* %51, i64 128
          %lowered_arg_val_ptr35.i.i = bitcast i8* %arg_val_ptr33.i.i to { i8, i64 }*
          store { i8, i64 } { i8 0, i64 41 }, { i8, i64 }* %lowered_arg_val_ptr35.i.i, align 8
          %arg_val_ptr37.i.i = getelementptr i8, i8* %51, i64 144
        ...
          store { i8, i64 } { i8 0, i64 2281 }, { i8, i64 }* %lowered_arg_val_ptr1827.i.i, align 8
        
        Show
        tarmstrong Tim Armstrong added a comment - I took a quick look at the optimised IR and it was pretty obvious what was wrong - we're not dealing with constant varargs very well - we're generated the code to populate the varargs buffer for every row. This works ok for non-varargs since it's pretty straightforward for LLVM to eliminate the deads stores, but in thie case I don't think there's any way for it to know that the stores to the varargs buffer are dead. GetSlotRef.exit.i.i: ; preds = %get_slot.i.i.i, %50 %is_null_phi.i.i.i = phi i1 [ false , %50 ], [ true , %get_slot.i.i.i ] %val_phi.i.i.i = phi i64 [ 0, %50 ], [ %val.i.i.i, %get_slot.i.i.i ] %lowered_arg_val_ptr3.i.i = bitcast i8* %51 to { i8, i64 }* store { i8, i64 } { i8 0, i64 1 }, { i8, i64 }* %lowered_arg_val_ptr3.i.i, align 8 %arg_val_ptr5.i.i = getelementptr i8, i8* %51, i64 16 %lowered_arg_val_ptr7.i.i = bitcast i8* %arg_val_ptr5.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 6 }, { i8, i64 }* %lowered_arg_val_ptr7.i.i, align 8 %arg_val_ptr9.i.i = getelementptr i8, i8* %51, i64 32 %lowered_arg_val_ptr11.i.i = bitcast i8* %arg_val_ptr9.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 11 }, { i8, i64 }* %lowered_arg_val_ptr11.i.i, align 8 %arg_val_ptr13.i.i = getelementptr i8, i8* %51, i64 48 %lowered_arg_val_ptr15.i.i = bitcast i8* %arg_val_ptr13.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 16 }, { i8, i64 }* %lowered_arg_val_ptr15.i.i, align 8 %arg_val_ptr17.i.i = getelementptr i8, i8* %51, i64 64 %lowered_arg_val_ptr19.i.i = bitcast i8* %arg_val_ptr17.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 21 }, { i8, i64 }* %lowered_arg_val_ptr19.i.i, align 8 %arg_val_ptr21.i.i = getelementptr i8, i8* %51, i64 80 %lowered_arg_val_ptr23.i.i = bitcast i8* %arg_val_ptr21.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 26 }, { i8, i64 }* %lowered_arg_val_ptr23.i.i, align 8 %arg_val_ptr25.i.i = getelementptr i8, i8* %51, i64 96 %lowered_arg_val_ptr27.i.i = bitcast i8* %arg_val_ptr25.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 31 }, { i8, i64 }* %lowered_arg_val_ptr27.i.i, align 8 %arg_val_ptr29.i.i = getelementptr i8, i8* %51, i64 112 %lowered_arg_val_ptr31.i.i = bitcast i8* %arg_val_ptr29.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 36 }, { i8, i64 }* %lowered_arg_val_ptr31.i.i, align 8 %arg_val_ptr33.i.i = getelementptr i8, i8* %51, i64 128 %lowered_arg_val_ptr35.i.i = bitcast i8* %arg_val_ptr33.i.i to { i8, i64 }* store { i8, i64 } { i8 0, i64 41 }, { i8, i64 }* %lowered_arg_val_ptr35.i.i, align 8 %arg_val_ptr37.i.i = getelementptr i8, i8* %51, i64 144 ... store { i8, i64 } { i8 0, i64 2281 }, { i8, i64 }* %lowered_arg_val_ptr1827.i.i, align 8
        Hide
        tarmstrong Tim Armstrong added a comment -

        IMPALA-4302,IMPALA-2379: constant expr arg fixes

        This patch fixes two issues around handling of constant expr args.
        The patches are combined because they touch some of the same code
        and depend on some of the same memory management cleanup.

        First, it fixes IMPALA-2379, where constant expr args were not visible
        to UDAFs. The issue is that the input exprs need to be opened before
        calling the UDAF Init() function.

        Second, it avoids overhead from repeated evaluation of constant
        arguments for ScalarFnCall expressions on both the codegen'd and
        interpreted paths. A common example is an IN predicate with a
        long list of constant values.

        The interpreted path was inefficient because it always evaluated all
        children expressions. Instead in this patch constant args are
        evaluated once and cached. The memory management of the AnyVal*
        objects was somewhat nebulous - adjusted it so that they're allocated
        from ExprContext::mem_pool_, which has the correct lifetime.

        The codegen'd path was inefficient only with varargs - with fixed
        arguments the LLVM optimiser is able to infer after inlining that
        the expressions are constant and remove all evaluation. However,
        for varargs it stores the vararg values into a heap-allocated buffer.
        The LLVM optimiser is unable to remove these stores because they
        have a side-effect that is visible to code outside the function.

        The codegen'd path is improved by evaluating varargs into an automatic
        buffer that can be optimised out. We also make a small related change
        to bake the string constants into the codegen'd code.

        Testing:
        Ran exhaustive build.

        Added regression test for IMPALA-2379 and MemPool test for aligned
        allocation. Added a test for in predicates with constant strings.

        Perf:
        Added a targeted query that demonstrates the improvement. Also manually
        validated the non-codegend perf. Also ran TPC-H and targeted perf
        queries locally - didn't see any significant changes.

        ----------------------------------------------------------------------------------------------------------------------------------------+

        Workload Query File Format Avg(s) Base Avg(s) Delta(Avg) StdDev(%) Base StdDev(%) Num Clients Iters

        ----------------------------------------------------------------------------------------------------------------------------------------+

        TARGETED-PERF(_20) primitive_filter_in_predicate parquet / none / none 1.19 9.82 I -87.85% 3.82% 0.71% 1 10

        ----------------------------------------------------------------------------------------------------------------------------------------+

        (I) Improvement: TARGETED-PERF(_20) primitive_filter_in_predicate [parquet / none / none] (9.82s -> 1.19s [-87.85%])
        ----------------------------------------------------------------------------------------------------------+

        Operator % of Query Avg Base Avg Delta(Avg) StdDev(%) Max Base Max Delta(Max) #Hosts #Rows Est #Rows

        ----------------------------------------------------------------------------------------------------------+

        01:AGGREGATE 14.39% 155.88ms 214.61ms -27.37% 2.68% 163.38ms 227.53ms -28.19% 1 1 1
        00:SCAN HDFS 85.60% 927.46ms 9.43s -90.16% 4.49% 1.01s 9.50s -89.42% 1 13.77K 14.05K

        ----------------------------------------------------------------------------------------------------------+

        Change-Id: I45c3ed8c9d7a61e94a9b9d6c316e8a53d9ff6c24
        Reviewed-on: http://gerrit.cloudera.org:8080/4838
        Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
        Tested-by: Internal Jenkins

        Show
        tarmstrong Tim Armstrong added a comment - IMPALA-4302 , IMPALA-2379 : constant expr arg fixes This patch fixes two issues around handling of constant expr args. The patches are combined because they touch some of the same code and depend on some of the same memory management cleanup. First, it fixes IMPALA-2379 , where constant expr args were not visible to UDAFs. The issue is that the input exprs need to be opened before calling the UDAF Init() function. Second, it avoids overhead from repeated evaluation of constant arguments for ScalarFnCall expressions on both the codegen'd and interpreted paths. A common example is an IN predicate with a long list of constant values. The interpreted path was inefficient because it always evaluated all children expressions. Instead in this patch constant args are evaluated once and cached. The memory management of the AnyVal* objects was somewhat nebulous - adjusted it so that they're allocated from ExprContext::mem_pool_, which has the correct lifetime. The codegen'd path was inefficient only with varargs - with fixed arguments the LLVM optimiser is able to infer after inlining that the expressions are constant and remove all evaluation. However, for varargs it stores the vararg values into a heap-allocated buffer. The LLVM optimiser is unable to remove these stores because they have a side-effect that is visible to code outside the function. The codegen'd path is improved by evaluating varargs into an automatic buffer that can be optimised out. We also make a small related change to bake the string constants into the codegen'd code. Testing: Ran exhaustive build. Added regression test for IMPALA-2379 and MemPool test for aligned allocation. Added a test for in predicates with constant strings. Perf: Added a targeted query that demonstrates the improvement. Also manually validated the non-codegend perf. Also ran TPC-H and targeted perf queries locally - didn't see any significant changes. ------------------- ----------------------------- --------------------- ------ ----------- ---------- --------- -------------- ----------- ------+ Workload Query File Format Avg(s) Base Avg(s) Delta(Avg) StdDev(%) Base StdDev(%) Num Clients Iters ------------------- ----------------------------- --------------------- ------ ----------- ---------- --------- -------------- ----------- ------+ TARGETED-PERF(_20) primitive_filter_in_predicate parquet / none / none 1.19 9.82 I -87.85% 3.82% 0.71% 1 10 ------------------- ----------------------------- --------------------- ------ ----------- ---------- --------- -------------- ----------- ------+ (I) Improvement: TARGETED-PERF(_20) primitive_filter_in_predicate [parquet / none / none] (9.82s -> 1.19s [-87.85%] ) ------------- ---------- -------- -------- ---------- --------- -------- -------- ---------- ------ ------ ----------+ Operator % of Query Avg Base Avg Delta(Avg) StdDev(%) Max Base Max Delta(Max) #Hosts #Rows Est #Rows ------------- ---------- -------- -------- ---------- --------- -------- -------- ---------- ------ ------ ----------+ 01:AGGREGATE 14.39% 155.88ms 214.61ms -27.37% 2.68% 163.38ms 227.53ms -28.19% 1 1 1 00:SCAN HDFS 85.60% 927.46ms 9.43s -90.16% 4.49% 1.01s 9.50s -89.42% 1 13.77K 14.05K ------------- ---------- -------- -------- ---------- --------- -------- -------- ---------- ------ ------ ----------+ Change-Id: I45c3ed8c9d7a61e94a9b9d6c316e8a53d9ff6c24 Reviewed-on: http://gerrit.cloudera.org:8080/4838 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Internal Jenkins

          People

          • Assignee:
            tarmstrong Tim Armstrong
            Reporter:
            mmokhtar Mostafa Mokhtar
          • Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

            Dates

            • Created:
              Updated:
              Resolved:

              Development