Uploaded image for project: 'Apache MADlib'
  1. Apache MADlib
  2. MADLIB-949

Handle multi-line list of variables better

    XMLWordPrintableJSON

Details

    • Improvement
    • Status: Open
    • Minor
    • Resolution: Unresolved
    • None
    • v3.0.0
    • All Modules
    • None

    Description

      "
      Column list input for correlation function cannot handle multi-line list of variables - see the example below where 17 columns were provided by 2 were left out of the correlation analysis due to \n being appended as a prefix to the first variable listed in a new line.

      drop view if exists rrnew.rdc_model_inputs_wallcols_gas;
      create or replace view rrnew.rdc_model_inputs_wallcols_gas as
      select *,
      ind_array[142] as ind_array_142,
      ind_array[140] as ind_array_140,
      ind_array[90] as ind_array_90,
      ind_array[96] as ind_array_96,
      ind_array[98] as ind_array_98,
      ind_array[134] as ind_array_134,
      ind_array[10] as ind_array_10,
      ind_array[8] as ind_array_8,
      ind_array[7] as ind_array_7,
      ind_array[95] as ind_array_95,
      ind_array[51] as ind_array_51,
      ind_array[139] as ind_array_139
      from rrnew.rdc_model_inputs_wallcols
      where wrldcat_mod = 'Gas Stations';

      – Executing query:
      select madlib.correlation(
      'rrnew.rdc_model_inputs_wallcols_gas',
      'rrnew.rdc_model_test_corr_gas_01',
      'dep_var, ind_array_142, ind_array_140, ind_array_90, ind_array_96, ind_array_98,
      ind_array_134, ind_array_10, ind_array_8, ind_array_7, ind_array_95, ind_array_51, ind_array_139,
      ct_sales_txns_overw, ct_returns_txns_overw, sum_sales_amt_overw, sum_returns_amt_overw',
      TRUE);
      – ""(rrnew.rdc_model_test_corr_gas_01,15,119.778332949)""
      INFO: Summary for 'correlation' function
      DETAIL:
      Columns that don't exist in 'rrnew.rdc_model_inputs_wallcols_gas' ignored: ['\nind_array_134', '\nct_sales_txns_overw']
      Producing correlation for columns: ['ind_array_7', 'ind_array_10', 'ind_array_140', 'ind_array_51', 'ind_array_142', 'ind_array_139', 'ind_array_8', 'ct_returns_txns_overw', 'ind_array_90', 'ind_array_96', 'ind_array_95', 'dep_var', 'ind_array_98', 'sum_sales_amt_overw', 'sum_returns_amt_overw']
      CONTEXT: PL/Python function ""correlation""
      NOTICE: Table doesn't have 'DISTRIBUTED BY' clause – Using column(s) named 'tot_cnt, mean' as the Greenplum Database data distribution key for this table.
      HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
      CONTEXT: SQL statement ""
      DROP TABLE IF EXISTS _madlib_temp_85896764_1449081244_37948045_;
      CREATE TABLE _madlib_temp_85896764_1449081244_37948045_ AS
      SELECT
      tot_cnt,
      count AS non_null_cnt,
      mean,
      madlib.correlation_agg(x, mean) as cor_mat
      FROM
      (
      SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
      FROM rrnew.rdc_model_inputs_wallcols_gas
      ) src1,
      (
      SELECT
      count AS tot_cnt,
      madlib.avg AS mean
      FROM
      (
      SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
      FROM rrnew.rdc_model_inputs_wallcols_gas
      ) src2
      ) subq
      WHERE NOT madlib.array_contains_null
      GROUP BY tot_cnt, mean
      ""
      PL/Python function ""correlation""
      NOTICE: table ""_madlib_temp_85896764_1449081244_37948045_"" does not exist, skipping
      CONTEXT: SQL statement ""
      DROP TABLE IF EXISTS _madlib_temp_85896764_1449081244_37948045_;
      CREATE TABLE _madlib_temp_85896764_1449081244_37948045_ AS
      SELECT
      tot_cnt,
      count AS non_null_cnt,
      mean,
      madlib.correlation_agg(x, mean) as cor_mat
      FROM
      (
      SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
      FROM rrnew.rdc_model_inputs_wallcols_gas
      ) src1,
      (
      SELECT
      count AS tot_cnt,
      madlib.avg AS mean
      FROM
      (
      SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
      FROM rrnew.rdc_model_inputs_wallcols_gas
      ) src2
      ) subq
      WHERE NOT madlib.array_contains_null
      GROUP BY tot_cnt, mean
      ""
      PL/Python function ""correlation""
      NOTICE: Table doesn't have 'DISTRIBUTED BY' clause – Using column(s) named 'method' as the Greenplum Database data distribution key for this table.
      HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
      CONTEXT: SQL statement ""
      DROP TABLE IF EXISTS rrnew.rdc_model_test_corr_gas_01_summary;
      CREATE TABLE rrnew.rdc_model_test_corr_gas_01_summary AS
      SELECT
      'correlation'::varchar AS method,
      'rrnew.rdc_model_inputs_wallcols_gas'::varchar AS source,
      'rrnew.rdc_model_test_corr_gas_01'::varchar AS output_table,
      'ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw'::varchar AS column_names,
      mean AS mean_vector,
      non_null_cnt AS total_rows_processed,
      tot_cnt - non_null_cnt AS total_rows_skipped
      FROM _madlib_temp_85896764_1449081244_37948045_
      ""
      PL/Python function ""correlation""
      NOTICE: Table doesn't have 'DISTRIBUTED BY' clause – Using column(s) named 'column_position' as the Greenplum Database data distribution key for this table.
      HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
      CONTEXT: SQL statement ""
      DROP TABLE IF EXISTS rrnew.rdc_model_test_corr_gas_01;
      CREATE TABLE rrnew.rdc_model_test_corr_gas_01 AS
      SELECT
      *
      FROM
      (
      SELECT
      generate_series(1, 15) AS column_position,
      unnest($1) AS variable
      ) variable_subq
      JOIN
      (
      SELECT
      *
      FROM
      madlib.__deconstruct_lower_triangle(
      (SELECT cor_mat FROM _madlib_temp_85896764_1449081244_37948045_)
      )
      AS deconstructed(column_position integer
      , ind_array_7 float8, ind_array_10 float8, ind_array_140 float8, ind_array_51 float8, ind_array_142 float8, ind_array_139 float8, ind_array_8 float8, ct_returns_txns_overw float8, ind_array_90 float8, ind_array_96 float8
      , ind_array_95 float8, dep_var float8, ind_array_98 float8, sum_sales_amt_overw float8, sum_returns_amt_overw float8)
      ) matrix_subq
      USING (column_position)
      ""
      PL/Python function ""correlation""
      Total query runtime: 170199 ms.
      1 row retrieved.

      select * from rrnew.rdc_model_test_corr_gas_01 order by 1;
      --Total query runtime: 831 ms.
      --15 rows retrieved.
      1;""ind_array_7"";1;;;;;;;;;;;;;;
      2;""ind_array_10"";-0.0578587333681671;1;;;;;;;;;;;;;
      3;""ind_array_140"";-0.141280063490456;-0.0298824594825621;1;;;;;;;;;;;;
      4;""ind_array_51"";0.598116457583963;-0.123774582622021;-0.165577429461016;1;;;;;;;;;;;
      5;""ind_array_142"";-0.122703307780106;0.305285848252308;-0.0453145748949762;-0.154444832472113;1;;;;;;;;;;
      6;""ind_array_139"";0.570721655517796;-0.138205302549432;-0.17841624847673;0.907429918473815;-0.167155495467681;1;;;;;;;;;
      7;""ind_array_8"";-0.0984959107166764;0.0508444910170194;0.406225489292519;-0.134253845525128;-0.0272393303189428;-0.148165608861266;1;;;;;;;;
      8;""ct_returns_txns_overw"";0.225717110532388;-0.0515613223161471;-0.0626670782890204;0.206218995895436;-0.0583236749061465;0.20466918711459;-0.053965315058231;1;;;;;;;
      9;""ind_array_90"";-0.0931577631832484;-0.019237688633164;-0.0468388474306542;-0.116091871910697;-0.0436128383781105;-0.126957508414138;-0.0245612547247926;-0.0393587080444247;1;;;;;;
      10;""ind_array_96"";-0.100053104765554;0.0485304192805481;0.407296182153894;-0.135679783095363;-0.0282837081849224;-0.148863287595196;0.925708384948968;-0.054183058576181;-0.022015894670012;1;;;;;
      11;""ind_array_95"";0.937696314043715;-0.0722795513203419;-0.148796403017105;0.600446785418866;-0.131698370371973;0.59523202625587;-0.106821466241569;0.24195998270111;-0.0967319753989314;-0.104855952107336;1;;;;
      12;""dep_var"";0.587437837717076;-0.175279017797318;-0.232814494915998;0.697484081384999;-0.2155480075257;0.761346299277184;-0.192642044082485;0.218586005606964;-0.16722483537619;-0.194369965415433;0.609343396712704;1;;;
      13;""ind_array_98"";-0.0612907301601465;0.923526436187384;-0.0295445046974965;-0.124735781736682;0.302142475697116;-0.138233539678189;0.0506370953977326;-0.0512638644969288;-0.0154606924847994;0.0580709575395514;-0.0677296211289303;-0.176583834731307;1;;
      14;""sum_sales_amt_overw"";0.718463532026027;-0.084061149182067;-0.12867635960654;0.684077740786926;-0.116794135493021;0.679447067897773;-0.100884986280351;0.236085719541536;-0.0853048449641809;-0.100468026927479;0.747788764485255;0.520951866439684;-0.0824610959973814;1;
      15;""sum_returns_amt_overw"";0.146426924665982;-0.0290252464863019;-0.0360714326537947;0.175403909662651;-0.0333782028379655;0.173672108949183;-0.030740431726783;0.586611434412563;-0.022849342897844;-0.0308901206349653;0.153035222403974;0.128197057078992;-0.028901463717606;0.236068463207792;1

      "

      Attachments

        Activity

          People

            Unassigned Unassigned
            fmcquillan Frank McQuillan
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

            Dates

              Created:
              Updated: