Uploaded image for project: 'Apache MADlib'
  1. Apache MADlib
  2. MADLIB-1218

DT and RF: categorical levels reported incorrectly when grouping is used

    XMLWordPrintableJSON

Details

    Description

      DROP TABLE IF EXISTS mt_cars;
      
      CREATE TABLE mt_cars (
          id integer NOT NULL,
          mpg double precision,
          cyl integer,
          disp double precision,
          hp integer,
          drat double precision,
          wt double precision,
          qsec double precision,
          vs integer,
          am integer,
          gear integer,
          carb integer
      );
      
      INSERT INTO mt_cars VALUES
      (1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
      (2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
      (3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
      (4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
      (5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
      (6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
      (7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
      (8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
      (9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
      (10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
      (11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
      (12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
      (13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
      (14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
      (15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
      (16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
      (17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
      (18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
      (19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
      (20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
      (21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
      (22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
      (23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
      (24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
      (25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
      (26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
      (27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
      (28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
      (29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
      (30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
      (31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
      (32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
      

      RF:

      DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
      
      SELECT madlib.forest_train('mt_cars',
                                 'mt_cars_output',
                                 'id',
                                 'mpg',
                                 '*',
                                 'id, hp, drat, am, gear, carb',  -- exclude columns
                                 'am',  -- grouping
                                 10::integer,
                                 2::integer,
                                 TRUE::boolean,
                                 1,
                                 10,
                                 8,
                                 3,
                                 10
                                 );
      SELECT * FROM mt_cars_output_summary;
      

      produces

      -[ RECORD 1 ]---------+-----------------------------------------------------------------------
      method                | forest_train
      is_classification     | f
      source_table          | mt_cars
      model_table           | mt_cars_output
      id_col_name           | id
      dependent_varname     | mpg
      independent_varnames  | vs,cyl,disp,qsec,wt
      cat_features          | vs,cyl
      con_features          | disp,qsec,wt
      grouping_cols         | am
      num_trees             | 10
      num_random_features   | 2
      max_tree_depth        | 10
      min_split             | 8
      min_bucket            | 3
      num_splits            | 10
      verbose               | f
      importance            | t
      num_permutations      | 1
      num_all_groups        | 2
      num_failed_groups     | 0
      total_rows_processed  | 32
      total_rows_skipped    | 0
      dependent_var_levels  | 
      dependent_var_type    | double precision
      independent_var_types | integer, integer, double precision, double precision, double precision
      null_proxy            | None
      
      SELECT * FROM mt_cars_output_group;
      

      produces

      -[ RECORD 1 ]------+--------------------------------------
      gid                | 1
      am                 | 0
      success            | t
      cat_n_levels       | {3,3}
      cat_levels_in_text | {4,6,8,4,6,8}
      oob_error          | 10.4384609425616
      cat_var_importance | {1.77200744569228,3.23893136930339}
      con_var_importance | {6.52806304913008,0,2.71255870662522}
      -[ RECORD 2 ]------+--------------------------------------
      gid                | 2
      am                 | 1
      success            | t
      cat_n_levels       | {2,2}
      cat_levels_in_text | {0,1,0,1}
      oob_error          | 22.3349978333876
      cat_var_importance | {4.59597959183674,0}
      con_var_importance | {5.715875,0,18.4067344897959}
      

      In the above, cat_n_levels and cat_levels_in_text are wrong. I would expect

      {0,1,4,6,8} to be reported for both groups.

      Compare the above with the case of no groups:
      DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, mt_cars_output_summary;
      
      SELECT madlib.forest_train('mt_cars',
                                 'mt_cars_output',
                                 'id',
                                 'mpg',
                                 '*',
                                 'id, hp, drat, am, gear, carb',  -- exclude columns
                                 NULL, -- no groups
                                 10::integer,
                                 2::integer,
                                 TRUE::boolean,
                                 1,
                                 10,
                                 8,
                                 3,
                                 10
                                 );
      SELECT * FROM mt_cars_output_summary;
      


      produces
      [ RECORD 1 ]---------+-----------------------------------------------------------------------
      method                | forest_train
      is_classification     | f
      source_table          | mt_cars
      model_table           | mt_cars_output
      id_col_name           | id
      dependent_varname     | mpg
      independent_varnames  | vs,cyl,disp,qsec,wt
      cat_features          | vs,cyl
      con_features          | disp,qsec,wt
      grouping_cols         | 
      num_trees             | 10
      num_random_features   | 2
      max_tree_depth        | 10
      min_split             | 8
      min_bucket            | 3
      num_splits            | 10
      verbose               | f
      importance            | t
      num_permutations      | 1
      num_all_groups        | 1
      num_failed_groups     | 0
      total_rows_processed  | 32
      total_rows_skipped    | 0
      dependent_var_levels  | 
      dependent_var_type    | double precision
      independent_var_types | integer, integer, double precision, double precision, double precision
      null_proxy            | None
      


      SELECT * FROM mt_cars_output_group;

      produces:

      gid | 1
      success | t
      cat_n_levels | {2,3}
      cat_levels_in_text | {0,1,4,6,8}

      oob_error | 9.90200731761036
      cat_var_importance |

      {0.390353827160495,12.8836927689546}

      con_var_importance |

      {0.118665925451092,0.947886524555693,19.910665401879}

      which seems OK.

      DT:

      I am seeing the same thing with decision tree. Here is a sample output grouping on 'am' from decision tree:

      madlib=# SELECT am, pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
      -[ RECORD 1 ]------+--------------
      am                 | 1
      pruning_cp         | 0
      cat_levels_in_text | {0,1,0,1}
      cat_n_levels       | {2,2}
      tree_depth         | 1
      -[ RECORD 2 ]------+--------------
      am                 | 0
      pruning_cp         | 0
      cat_levels_in_text | {4,6,8,4,6,8}
      cat_n_levels       | {3,3}
      tree_depth         | 3
      

      Attachments

        Activity

          People

            riyer Rahul Iyer
            fmcquillan Frank McQuillan
            Votes:
            0 Vote for this issue
            Watchers:
            2 Start watching this issue

            Dates

              Created:
              Updated:
              Resolved: