Uploaded image for project: 'Hive'
  1. Hive
  2. HIVE-14967

bucketing with union is broken on Tez in certain cases

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Open
    • Major
    • Resolution: Unresolved
    • None
    • None
    • None
    • None

    Description

      I am not sure whether partitions are needed in the source or the destination; it may still break with non-partitioned tables. Also, I don't think it's specific to tablesample; I suspect it has something to do with the bucket file not being created.
      Running the following on MiniTez

      set hive.mapred.mode=nonstrict;
      set hive.explain.user=false;
      set hive.exec.dynamic.partition.mode=nonstrict;
      set hive.fetch.task.conversion=none;
      set tez.grouping.min-size=1;
      set tez.grouping.max-size=2;
      set hive.tez.auto.reducer.parallelism=false;
      
      drop table intermediate;
      create table intermediate(key int) partitioned by (p int) stored as orc;
      insert into table intermediate partition(p='455') select distinct key from src where key >= 0 order by key desc limit 2;
      insert into table intermediate partition(p='456') select distinct key from src where key is not null order by key asc limit 2;
      
      create table bucket1_mm(key int, id int) partitioned by (key2 int)
      clustered by (key) sorted by (key) into 2 buckets;
      
      insert into table bucket1_mm partition (key2)
      select key + 1, key, key - 1 from intermediate
      union all 
      select key - 1, key, key + 1 from intermediate;
      
      select * from bucket1_mm tablesample (bucket 2 out of 2) s;
      

      Results in

      2016-10-14T13:46:39,875 ERROR [96b8187f-c8a3-4075-94ae-cb866ce293c7 main] ql.Driver: FAILED: ArrayIndexOutOfBoundsException 1
      java.lang.ArrayIndexOutOfBoundsException: 1
      	at org.apache.hadoop.hive.ql.metadata.Partition.getBucketPath(Partition.java:373)
      	at org.apache.hadoop.hive.ql.optimizer.SamplePruner.prune(SamplePruner.java:199)
      	at org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils.setMapWork(GenMapRedUtils.java:599)
      	at org.apache.hadoop.hive.ql.parse.GenTezUtils.setupMapWork(GenTezUtils.java:210)
      	at org.apache.hadoop.hive.ql.parse.GenTezUtils.createMapWork(GenTezUtils.java:189)
      	at org.apache.hadoop.hive.ql.parse.GenTezWork.process(GenTezWork.java:128)
      	at org.apache.hadoop.hive.ql.lib.CompositeProcessor.process(CompositeProcessor.java:41)
      	at org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher.dispatch(DefaultRuleDispatcher.java:90)
      	at org.apache.hadoop.hive.ql.lib.DefaultGraphWalker.dispatchAndReturn(DefaultGraphWalker.java:105)
      	at org.apache.hadoop.hive.ql.parse.GenTezWorkWalker.walk(GenTezWorkWalker.java:90)
      	at org.apache.hadoop.hive.ql.parse.GenTezWorkWalker.walk(GenTezWorkWalker.java:109)
      	at org.apache.hadoop.hive.ql.parse.GenTezWorkWalker.walk(GenTezWorkWalker.java:109)
      	at org.apache.hadoop.hive.ql.parse.GenTezWorkWalker.walk(GenTezWorkWalker.java:109)
      	at org.apache.hadoop.hive.ql.parse.GenTezWorkWalker.startWalking(GenTezWorkWalker.java:72)
      	at org.apache.hadoop.hive.ql.parse.TezCompiler.generateTaskTree(TezCompiler.java:382)
      	at org.apache.hadoop.hive.ql.parse.TaskCompiler.compile(TaskCompiler.java:270)
      	at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10928)
      	at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:260)
      	at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:251)
      	at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:467)
      	at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:342)
      	at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1226)
      	at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1346)
      	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1143)
      	at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1131)
      	at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:233)
      	at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:184)
      	at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:400)
      	at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:336)
      	at org.apache.hadoop.hive.ql.QTestUtil.executeClientInternal(QTestUtil.java:1319)
      	at org.apache.hadoop.hive.ql.QTestUtil.executeClient(QTestUtil.java:1293)
      	at org.apache.hadoop.hive.cli.control.CoreCliDriver.runTest(CoreCliDriver.java:173)
      	at org.apache.hadoop.hive.cli.control.CliAdapter.runTest(CliAdapter.java:104)
      	at org.apache.hadoop.hive.cli.TestMiniTezCliDriver.testCliDriver(TestMiniTezCliDriver.java:59)
      	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
      	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
      	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
      	at java.lang.reflect.Method.invoke(Method.java:497)
      	at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47)
      	at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
      	at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44)
      	at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
      	at org.apache.hadoop.hive.cli.control.CliAdapter$2$1.evaluate(CliAdapter.java:92)
      	at org.junit.rules.RunRules.evaluate(RunRules.java:20)
      	at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271)
      	at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70)
      	at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50)
      	at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238)
      	at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63)
      	at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236)
      	at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53)
      	at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229)
      	at org.junit.runners.ParentRunner.run(ParentRunner.java:309)
      	at org.junit.runners.Suite.runChild(Suite.java:127)
      	at org.junit.runners.Suite.runChild(Suite.java:26)
      	at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238)
      	at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63)
      	at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236)
      	at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53)
      	at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229)
      	at org.apache.hadoop.hive.cli.control.CliAdapter$1$1.evaluate(CliAdapter.java:73)
      	at org.junit.rules.RunRules.evaluate(RunRules.java:20)
      	at org.junit.runners.ParentRunner.run(ParentRunner.java:309)
      	at org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:367)
      	at org.apache.maven.surefire.junit4.JUnit4Provider.executeWithRerun(JUnit4Provider.java:274)
      	at org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:238)
      	at org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:161)
      	at org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:290)
      	at org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:242)
      	at org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:121)
      

      Attachments

        Activity

          People

            Unassigned Unassigned
            sershe Sergey Shelukhin
            Votes:
            0 Vote for this issue
            Watchers:
            3 Start watching this issue

            Dates

              Created:
              Updated: