Uploaded image for project: 'Apache Tez'
  1. Apache Tez
  2. TEZ-2741

Hive on Tez does not work well with Sequence Files Schema changes

    XMLWordPrintableJSON

Details

    • Bug
    • Status: Reopened
    • Major
    • Resolution: Unresolved
    • None
    • None
    • None
    • None

    Description

      hive> create external table foo (a string) partitioned by (p string) stored as sequencefile location 'hdfs:///user/hive/foo'
      
      # A useless file with some text in hdfs
      hive> create external table tmp_foo (a string) location 'hdfs:///tmp/random_data'
      hive> insert overwrite table foo partition (p = '1') select * from tmp_foo
      

      After this step, foo contains one partition with a text file.

      Now use this Java program to generate a second sequence file (with a different key class) in the partition directory p=2:

      import org.apache.hadoop.conf.Configuration;
      import org.apache.hadoop.fs.Path;
      import org.apache.hadoop.io.BytesWritable;
      import org.apache.hadoop.io.LongWritable;
      import org.apache.hadoop.io.Text;
      import org.apache.hadoop.mapreduce.Job;
      import org.apache.hadoop.mapreduce.Mapper;
      import org.apache.hadoop.mapreduce.Reducer;
      import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
      import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
      
      import java.io.IOException;
      
      /**
       * Reproduction helper for this issue: runs a map-only MapReduce job that
       * reads the text under /tmp/random_data and writes it as a sequence file
       * into /user/hive/foo/p=2/.
       *
       * The job declares LongWritable as its output key class, so the generated
       * file's key class differs from the one reported in the error for this
       * table (BytesWritable).
       */
      public class SequenceFileWriter {
        /**
         * Configures, submits and waits for the conversion job.
         *
         * @param args ignored; input and output paths are hard-coded
         * @throws IOException if job setup or submission fails
         * @throws InterruptedException if interrupted while waiting for the job
         * @throws ClassNotFoundException if a job class cannot be resolved
         */
        public static void main(String[] args) throws IOException,
            InterruptedException, ClassNotFoundException {

          Configuration conf = new Configuration();
          // Job.getInstance replaces the deprecated Job(Configuration) constructor.
          Job job = Job.getInstance(conf);
          job.setJobName("Convert Text");
          job.setJarByClass(Mapper.class);

          // The base Mapper/Reducer classes are used as-is; no custom logic is
          // needed because the goal is only to re-encode the input records.
          job.setMapperClass(Mapper.class);
          job.setReducerClass(Reducer.class);

          // increase if you need sorting or a special number of files
          job.setNumReduceTasks(0);

          job.setOutputKeyClass(LongWritable.class);
          job.setOutputValueClass(Text.class);

          job.setOutputFormatClass(SequenceFileOutputFormat.class);
          job.setInputFormatClass(TextInputFormat.class);

          TextInputFormat.addInputPath(job, new Path("/tmp/random_data"));
          SequenceFileOutputFormat.setOutputPath(job, new Path("/user/hive/foo/p=2/"));

          // submit and wait for completion; surface a failed job through the
          // exit status instead of silently returning 0
          boolean succeeded = job.waitForCompletion(true);
          if (!succeeded) {
            System.exit(1);
          }
        }
      }
      

      Now run select count(*) from foo; — it passes with MapReduce, but fails with Tez with the following error:

      hive> set hive.execution.engine=tez;
      hive> select count(*) from foo;
      
      Status: Failed
      Vertex failed, vertexName=Map 1, vertexId=vertex_1438013895843_0007_1_00, diagnostics=[Task failed, taskId=task_1438013895843_0007_1_00_000000, diagnostics=[TaskAttempt 0 failed, info=[Error: Failure while running task:java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: While processing file hdfs://localhost:9000/user/hive/foo/p=2/part-m-00000. wrong key class: org.apache.hadoop.io.BytesWritable is not class org.apache.hadoop.io.LongWritable
      	at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:171)
      	at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.run(TezProcessor.java:137)
      	at org.apache.tez.runtime.LogicalIOProcessorRuntimeTask.run(LogicalIOProcessorRuntimeTask.java:337)
      	at org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable$1.run(TezTaskRunner.java:179)
      	at org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable$1.run(TezTaskRunner.java:171)
      	at java.security.AccessController.doPrivileged(Native Method)
      	at javax.security.auth.Subject.doAs(Subject.java:415)
      	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1635)
      	at org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable.callInternal(TezTaskRunner.java:171)
      	at org.apache.tez.runtime.task.TezTaskRunner$TaskRunnerCallable.callInternal(TezTaskRunner.java:167)
      	at org.apache.tez.common.CallableWithNdc.call(CallableWithNdc.java:36)
      	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
      	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
      	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
      	at java.lang.Thread.run(Thread.java:745)
      Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.io.IOException: java.io.IOException: While processing file hdfs://localhost:9000/user/hive/foo/p=2/part-m-00000. wrong key class: org.apache.hadoop.io.BytesWritable is not class org.apache.hadoop.io.LongWritable
      	at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:71)
      	at org.apache.hadoop.hive.ql.exec.tez.MapRecordProcessor.run(MapRecordProcessor.java:290)
      	at org.apache.hadoop.hive.ql.exec.tez.TezProcessor.initializeAndRunProcessor(TezProcessor.java:148)
      	... 14 more
      Caused by: java.io.IOException: java.io.IOException: While processing file hdfs://localhost:9000/user/hive/foo/p=2/part-m-00000. wrong key class: org.apache.hadoop.io.BytesWritable is not class org.apache.hadoop.io.LongWritable
      	at org.apache.hadoop.hive.io.HiveIOExceptionHandlerChain.handleRecordReaderNextException(HiveIOExceptionHandlerChain.java:121)
      	at org.apache.hadoop.hive.io.HiveIOExceptionHandlerUtil.handleRecordReaderNextException(HiveIOExceptionHandlerUtil.java:77)
      	at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:372)
      	at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:79)
      	at org.apache.hadoop.hive.ql.io.HiveRecordReader.doNext(HiveRecordReader.java:33)
      	at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.next(HiveContextAwareRecordReader.java:118)
      	at org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat$TezGroupedSplitsRecordReader.next(TezGroupedSplitsInputFormat.java:137)
      	at org.apache.tez.mapreduce.lib.MRReaderMapred.next(MRReaderMapred.java:113)
      	at org.apache.hadoop.hive.ql.exec.tez.MapRecordSource.pushRecord(MapRecordSource.java:61)
      	... 16 more
      Caused by: java.io.IOException: While processing file hdfs://localhost:9000/user/hive/foo/p=2/part-m-00000. wrong key class: org.apache.hadoop.io.BytesWritable is not class org.apache.hadoop.io.LongWritable
      	at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.handleExceptionWhenReadNext(HiveContextAwareRecordReader.java:386)
      	at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:368)
      	... 22 more
      Caused by: java.io.IOException: wrong key class: org.apache.hadoop.io.BytesWritable is not class org.apache.hadoop.io.LongWritable
      	at org.apache.hadoop.io.SequenceFile$Reader.next(SequenceFile.java:2484)
      	at org.apache.hadoop.mapred.SequenceFileRecordReader.next(SequenceFileRecordReader.java:82)
      	at org.apache.hadoop.hive.ql.io.HiveContextAwareRecordReader.doNext(HiveContextAwareRecordReader.java:359)
      	... 22 more
      

      Attachments

        1. garbled_text
          0.1 kB
          Rajat Jain
        2. TEZ-2741.1.patch
          0.8 kB
          Gopal Vijayaraghavan

        Activity

          People

            gopalv Gopal Vijayaraghavan
            rajatj Rajat Jain
            Votes:
            0 Vote for this issue
            Watchers:
            5 Start watching this issue

            Dates

              Created:
              Updated: