Details
-
Improvement
-
Status: Open
-
Minor
-
Resolution: Unresolved
-
None
-
None
Description
"
Column list input for correlation function cannot handle multi-line list of variables - see the example below where 17 columns were provided by 2 were left out of the correlation analysis due to \n being appended as a prefix to the first variable listed in a new line.
drop view if exists rrnew.rdc_model_inputs_wallcols_gas;
create or replace view rrnew.rdc_model_inputs_wallcols_gas as
select *,
ind_array[142] as ind_array_142,
ind_array[140] as ind_array_140,
ind_array[90] as ind_array_90,
ind_array[96] as ind_array_96,
ind_array[98] as ind_array_98,
ind_array[134] as ind_array_134,
ind_array[10] as ind_array_10,
ind_array[8] as ind_array_8,
ind_array[7] as ind_array_7,
ind_array[95] as ind_array_95,
ind_array[51] as ind_array_51,
ind_array[139] as ind_array_139
from rrnew.rdc_model_inputs_wallcols
where wrldcat_mod = 'Gas Stations';
– Executing query:
select madlib.correlation(
'rrnew.rdc_model_inputs_wallcols_gas',
'rrnew.rdc_model_test_corr_gas_01',
'dep_var, ind_array_142, ind_array_140, ind_array_90, ind_array_96, ind_array_98,
ind_array_134, ind_array_10, ind_array_8, ind_array_7, ind_array_95, ind_array_51, ind_array_139,
ct_sales_txns_overw, ct_returns_txns_overw, sum_sales_amt_overw, sum_returns_amt_overw',
TRUE);
– ""(rrnew.rdc_model_test_corr_gas_01,15,119.778332949)""
INFO: Summary for 'correlation' function
DETAIL:
Columns that don't exist in 'rrnew.rdc_model_inputs_wallcols_gas' ignored: ['\nind_array_134', '\nct_sales_txns_overw']
Producing correlation for columns: ['ind_array_7', 'ind_array_10', 'ind_array_140', 'ind_array_51', 'ind_array_142', 'ind_array_139', 'ind_array_8', 'ct_returns_txns_overw', 'ind_array_90', 'ind_array_96', 'ind_array_95', 'dep_var', 'ind_array_98', 'sum_sales_amt_overw', 'sum_returns_amt_overw']
CONTEXT: PL/Python function ""correlation""
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause – Using column(s) named 'tot_cnt, mean' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT: SQL statement ""
DROP TABLE IF EXISTS _madlib_temp_85896764_1449081244_37948045_;
CREATE TABLE _madlib_temp_85896764_1449081244_37948045_ AS
SELECT
tot_cnt,
count AS non_null_cnt,
mean,
madlib.correlation_agg(x, mean) as cor_mat
FROM
(
SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
FROM rrnew.rdc_model_inputs_wallcols_gas
) src1,
(
SELECT
count AS tot_cnt,
madlib.avg AS mean
FROM
(
SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
FROM rrnew.rdc_model_inputs_wallcols_gas
) src2
) subq
WHERE NOT madlib.array_contains_null
GROUP BY tot_cnt, mean
""
PL/Python function ""correlation""
NOTICE: table ""_madlib_temp_85896764_1449081244_37948045_"" does not exist, skipping
CONTEXT: SQL statement ""
DROP TABLE IF EXISTS _madlib_temp_85896764_1449081244_37948045_;
CREATE TABLE _madlib_temp_85896764_1449081244_37948045_ AS
SELECT
tot_cnt,
count AS non_null_cnt,
mean,
madlib.correlation_agg(x, mean) as cor_mat
FROM
(
SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
FROM rrnew.rdc_model_inputs_wallcols_gas
) src1,
(
SELECT
count AS tot_cnt,
madlib.avg AS mean
FROM
(
SELECT ARRAY[ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw]::float8[] AS x
FROM rrnew.rdc_model_inputs_wallcols_gas
) src2
) subq
WHERE NOT madlib.array_contains_null
GROUP BY tot_cnt, mean
""
PL/Python function ""correlation""
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause – Using column(s) named 'method' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT: SQL statement ""
DROP TABLE IF EXISTS rrnew.rdc_model_test_corr_gas_01_summary;
CREATE TABLE rrnew.rdc_model_test_corr_gas_01_summary AS
SELECT
'correlation'::varchar AS method,
'rrnew.rdc_model_inputs_wallcols_gas'::varchar AS source,
'rrnew.rdc_model_test_corr_gas_01'::varchar AS output_table,
'ind_array_7,ind_array_10,ind_array_140,ind_array_51,ind_array_142,ind_array_139,ind_array_8,ct_returns_txns_overw,ind_array_90,ind_array_96,ind_array_95,dep_var,ind_array_98,sum_sales_amt_overw,sum_returns_amt_overw'::varchar AS column_names,
mean AS mean_vector,
non_null_cnt AS total_rows_processed,
tot_cnt - non_null_cnt AS total_rows_skipped
FROM _madlib_temp_85896764_1449081244_37948045_
""
PL/Python function ""correlation""
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause – Using column(s) named 'column_position' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CONTEXT: SQL statement ""
DROP TABLE IF EXISTS rrnew.rdc_model_test_corr_gas_01;
CREATE TABLE rrnew.rdc_model_test_corr_gas_01 AS
SELECT
*
FROM
(
SELECT
generate_series(1, 15) AS column_position,
unnest($1) AS variable
) variable_subq
JOIN
(
SELECT
*
FROM
madlib.__deconstruct_lower_triangle(
(SELECT cor_mat FROM _madlib_temp_85896764_1449081244_37948045_)
)
AS deconstructed(column_position integer
, ind_array_7 float8, ind_array_10 float8, ind_array_140 float8, ind_array_51 float8, ind_array_142 float8, ind_array_139 float8, ind_array_8 float8, ct_returns_txns_overw float8, ind_array_90 float8, ind_array_96 float8
, ind_array_95 float8, dep_var float8, ind_array_98 float8, sum_sales_amt_overw float8, sum_returns_amt_overw float8)
) matrix_subq
USING (column_position)
""
PL/Python function ""correlation""
Total query runtime: 170199 ms.
1 row retrieved.
select * from rrnew.rdc_model_test_corr_gas_01 order by 1;
--Total query runtime: 831 ms.
--15 rows retrieved.
1;""ind_array_7"";1;;;;;;;;;;;;;;
2;""ind_array_10"";-0.0578587333681671;1;;;;;;;;;;;;;
3;""ind_array_140"";-0.141280063490456;-0.0298824594825621;1;;;;;;;;;;;;
4;""ind_array_51"";0.598116457583963;-0.123774582622021;-0.165577429461016;1;;;;;;;;;;;
5;""ind_array_142"";-0.122703307780106;0.305285848252308;-0.0453145748949762;-0.154444832472113;1;;;;;;;;;;
6;""ind_array_139"";0.570721655517796;-0.138205302549432;-0.17841624847673;0.907429918473815;-0.167155495467681;1;;;;;;;;;
7;""ind_array_8"";-0.0984959107166764;0.0508444910170194;0.406225489292519;-0.134253845525128;-0.0272393303189428;-0.148165608861266;1;;;;;;;;
8;""ct_returns_txns_overw"";0.225717110532388;-0.0515613223161471;-0.0626670782890204;0.206218995895436;-0.0583236749061465;0.20466918711459;-0.053965315058231;1;;;;;;;
9;""ind_array_90"";-0.0931577631832484;-0.019237688633164;-0.0468388474306542;-0.116091871910697;-0.0436128383781105;-0.126957508414138;-0.0245612547247926;-0.0393587080444247;1;;;;;;
10;""ind_array_96"";-0.100053104765554;0.0485304192805481;0.407296182153894;-0.135679783095363;-0.0282837081849224;-0.148863287595196;0.925708384948968;-0.054183058576181;-0.022015894670012;1;;;;;
11;""ind_array_95"";0.937696314043715;-0.0722795513203419;-0.148796403017105;0.600446785418866;-0.131698370371973;0.59523202625587;-0.106821466241569;0.24195998270111;-0.0967319753989314;-0.104855952107336;1;;;;
12;""dep_var"";0.587437837717076;-0.175279017797318;-0.232814494915998;0.697484081384999;-0.2155480075257;0.761346299277184;-0.192642044082485;0.218586005606964;-0.16722483537619;-0.194369965415433;0.609343396712704;1;;;
13;""ind_array_98"";-0.0612907301601465;0.923526436187384;-0.0295445046974965;-0.124735781736682;0.302142475697116;-0.138233539678189;0.0506370953977326;-0.0512638644969288;-0.0154606924847994;0.0580709575395514;-0.0677296211289303;-0.176583834731307;1;;
14;""sum_sales_amt_overw"";0.718463532026027;-0.084061149182067;-0.12867635960654;0.684077740786926;-0.116794135493021;0.679447067897773;-0.100884986280351;0.236085719541536;-0.0853048449641809;-0.100468026927479;0.747788764485255;0.520951866439684;-0.0824610959973814;1;
15;""sum_returns_amt_overw"";0.146426924665982;-0.0290252464863019;-0.0360714326537947;0.175403909662651;-0.0333782028379655;0.173672108949183;-0.030740431726783;0.586611434412563;-0.022849342897844;-0.0308901206349653;0.153035222403974;0.128197057078992;-0.028901463717606;0.236068463207792;1
"