Index: conf/proto-hive-site.xml
===================================================================
--- conf/proto-hive-site.xml    (revision 1208047)
+++ conf/proto-hive-site.xml    (working copy)
@@ -57,7 +57,7 @@
 <property>
   <name>hive.metastore.sasl.enabled</name>
-  <value>false</value>
+  <value>SASL_ENABLED</value>
   <description>If true, the metastore thrift interface will be secured with SASL.
   Clients must authenticate with Kerberos.</description>
 </property>
@@ -81,7 +81,7 @@
 <property>
   <name>hive.metastore.uris</name>
-  <value>thrift://SVRHOST:9933</value>
+  <value>thrift://SVRHOST:PORT</value>
   <description>URI for client to contact metastore server</description>
 </property>

Index: src/test/e2e/hcatalog/tools/install/install.sh
===================================================================
--- src/test/e2e/hcatalog/tools/install/install.sh    (revision 0)
+++ src/test/e2e/hcatalog/tools/install/install.sh    (revision 0)
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script assumes that it is being run from the top level directory of the
+# HCatalog distribution tarball
+
+host="unknown"
+dir="unknown"
+hadoop_home="unknown"
+tarball="unknown"
+dbroot="unknown"
+portnum="9933"
+passwd="hive"
+warehouseDir="/user/hive/warehouse"
+sasl="false"
+keytabpath="unknown"
+kerberosprincipal="unknown"
+forrest="unknown"
+
+function usage() {
+    echo "Usage: $0 -D dbroot -d directory -f forrest -h hadoop_home "
+    echo "  -m host -t tarball"
+    echo "  [-p portnum] [-P password] [-w warehouse_directory]"
+    echo "  [-s true|false -k keytabpath -K kerberos_principal]"
+    echo
+    echo "  dbroot is the root directory for the mysql drivers"
+    echo "  directory is the directory where it will be installed"
+    echo "  hadoop_home is the directory of your Hadoop installation."
+ echo " host is the machine to install the HCatalog server on" + echo " tarball is the result of running ant src-release in hcat" + echo " portnum is the port for the thrift server to use, " \ + "default $portnum" + echo " password is the password for the metastore db, default $passwd" + echo " warehouse_directory is the HDFS directory to use for " \ + "internal hive tables, default $warehouseDir" + echo " -s true will enable security, -s false turn it off, " \ + "default $sasl" + echo " keytabpath is path to Kerberos keytab file, required with " \ + "-s true" + echo " kerberos_principal service principal for thrift server, " \ + "required with -s true" + echo " All paths must be absolute" +} + +while [ "${1}x" != "x" ] ; do + if [ $1 == "-D" ] ; then + shift + dbroot=$1 + shift + elif [ $1 == "-d" ] ; then + shift + dir=$1 + shift + elif [ $1 == "-f" ] ; then + shift + forrest=$1 + shift + elif [ $1 == "-h" ] ; then + shift + hadoop_home=$1 + shift + elif [ $1 == "-K" ] ; then + shift + kerberosprincipal=$1 + kerberosprincipal=${kerberosprincipal/@/\\@} + shift + elif [ $1 == "-k" ] ; then + shift + keytabpath=$1 + shift + elif [ $1 == "-m" ] ; then + shift + host=$1 + shift + elif [ $1 == "-p" ] ; then + shift + portnum=$1 + shift + elif [ $1 == "-P" ] ; then + shift + passwd=$1 + shift + elif [ $1 == "-s" ] ; then + shift + sasl=$1 + shift + elif [ $1 == "-t" ] ; then + shift + tarball=$1 + shift + elif [ $1 == "-w" ] ; then + shift + warehouseDir=$1 + shift + else + echo "Unknown option $1" + shift + fi + +done + +for var in $forrest $dbroot $host $dir $hadoop_home $tarball ; do + if [ $var == "unknown" ] ; then + usage + exit 1 + fi +done + +# Make sure root and dbroot are absolute paths + +for var in $forrest $dbroot $dir $hadoop_home ; do + if [ ${var:0:1} != "/" ] ; then + usage + exit 1 + fi +done + +# Take the src distribution and build an installable tarball +# Copy the tarball over +rm -rf /tmp/${USER}_hcat_scratch +mkdir /tmp/${USER}_hcat_scratch +cd /tmp/${USER}_hcat_scratch +cp $tarball . +tar zxf * +dirname=`ls -1 | grep -v gz` +cd $dirname +ant -Dforrest.home=$forrest tar +tarfiledir=`pwd` +tarfilebase=`ls build/hcatalog-*.tar.gz` +tarfile="$tarfiledir/$tarfilebase" + +tfile=/tmp/${USER}_hcat_test_tarball.tgz +scp $tarfile $host:$tfile + +# Write a quick perl script to modify the hive-site.xml file +pfile=/tmp/${USER}_hcat_test_hive_site_modify.pl +cat > $pfile <) { + s!DBHOSTNAME!$host!; + s!SVRHOST!$host!; + s!PASSWORD!$passwd!; + s!WAREHOUSE_DIR!$warehouseDir!; + s!SASL_ENABLED!$sasl!; + s!KEYTAB_PATH!$keytabpath!; + s!KERBEROS_PRINCIPAL!$kerberosprincipal!; + s!PORT!$portnum!; + print; +} +! + + +# Run the install script +file=/tmp/${USER}_hcat_test_install.sh +cat > $file < $dir/etc/hcatalog/hive-site.xml +! + +scp $file $host:$file +scp $pfile $host:$pfile +ssh $host chmod +x $file +ssh $host $file +if [ $? 
!= "0" ] ; then + echo "Failed to install hcat" + exit 1 +fi + +# Stop the current server +file=/tmp/${USER}_hcat_test_install_stop_server.sh +cat > $file < + + + + + + + + + + + + + + + + + + + + + + + + + *** Compiling UDFs *** + + + + + + + *** Creating UDF jar *** + + + + + + + + + + + + + + + + + + + + + Index: src/test/e2e/hcatalog/tools/generate/generate_data.pl =================================================================== --- src/test/e2e/hcatalog/tools/generate/generate_data.pl (revision 1208047) +++ src/test/e2e/hcatalog/tools/generate/generate_data.pl (working copy) @@ -1,26 +1,28 @@ #!/usr/bin/env perl -############################################################################ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # A utility to generate test data for pig test harness tests. # # use strict; use charnames (); +use Cwd; +use IPC::Run qw(run); our @firstName = ("alice", "bob", "calvin", "david", "ethan", "fred", "gabriella", "holly", "irene", "jessica", "katie", "luke", "mike", "nick", @@ -137,16 +139,15 @@ sub randomNameAgeGpaMap() { my $size = int(rand(3)); - my $map = "["; my @mapValues = ( "name#" . randomName(), "age#" . randomAge(), "gpa#" . randomGpa() ); $size = ($size == 0 ? 1 : $size); + my $map; for(my $i = 0; $i <= $size; $i++) { $map .= $mapValues[$i]; if($i != $size) { $map .= ","; } } - $map .= "]"; return $map; } @@ -169,47 +170,103 @@ sub randomNameAgeGpaTuple() { my $gpa = sprintf("%0.2f", randomGpa()); - return "(" . randomName() . "," . randomAge() . "," . $gpa . ")" ; + return randomName() . "," . randomAge() . "," . $gpa ; } -sub randomNameAgeGpaBag() +sub randomList() { - my $size = int(rand(int(3))); - my $bag = "{"; - $size = ($size == 0 ? 
1 : $size); + my $size = int(rand(int(3))) + 1; + my $bag; for(my $i = 0; $i <= $size; $i++) { - $bag .= randomNameAgeGpaTuple(); - if($i != $size) { - $bag .= ","; - } + $bag .= randomAge(); + $bag .= "," if ($i != $size); } - $bag .= "}"; return $bag; } -our @textDoc = ( - "The cosmological proof, which we are now about to ex-", - "amine, retains the connection of absolute necessity with the", - "highest reality, but instead of reasoning, like the former proof,", - "from the highest reality to necessity of existence, it reasons", - "from the previously given unconditioned necessity of some", - "being to the unlimited reality of that being. It thus enters upon", - "a course of reasoning which, whether rational or only pseudo-", - "rational, is at any rate natural, and the most convincing not", - "only for common sense but even for speculative understand-", - "ing. It also sketches the first outline of all the proofs in natural", - "theology, an outline which has always been and always will", - "be followed, however much embellished and disguised by", - "superfluous additions. This proof, termed by Leibniz the proof", - "a contingentia mundi, we shall now proceed to expound and", - "examine."); +sub randomEscape() +{ + my $r = rand(1); + if ($r < 0.16) { + return '\"'; + } elsif ($r < 0.32) { + return '\\\\'; + } elsif ($r < 0.48) { + return '\/'; + } elsif ($r < 0.64) { + return '\n'; + } elsif ($r < 0.80) { + return '\t'; + } else { + return randomUnicodeHex(); + } +} + +sub randomJsonString() +{ + my $r = rand(1); + #if ($r < 0.05) { + # return "null"; + #} elsif ($r < 0.10) { + # return randomName() . randomEscape() . randomName(); + #} else { + return randomName(); + #} +} + +sub randomNullBoolean() +{ + my $r = rand(1); + if ($r < 0.05) { + return 'null'; + } elsif ($r < 0.525) { + return 'true'; + } else { + return 'false'; + } +} + +sub randomJsonMap() +{ + if (rand(1) < 0.05) { + return 'null'; + } + + my $str = "{"; + my $num = rand(5) + 1; + for (my $i = 0; $i < $num; $i++) { + $str .= "," unless $i == 0; + $str .= '"' . randomCity() . '" : "' . randomName() . '"'; + } + $str .= "}"; + return $str; +} + +sub randomJsonBag() +{ + if (rand(1) < 0.05) { + return 'null'; + } + + my $str = "["; + my $num = rand(5) + 1; + for (my $i = 0; $i < $num; $i++) { + $str .= "," unless $i == 0; + $str .= '{"a":' . int(rand(2**32) - 2**31) . ',"b":"' . + randomJsonString() . '"}'; + } + $str .= "]"; +} + sub usage() { - warn "Usage: $0 filetype numrows tablename targetdir [nosql]\n"; - warn "\tValid filetypes [studenttab, studentcolon, \n"; - warn "\t\tstudentnulltab, studentcomplextab, studentctrla, voternulltab\n"; - warn "\t\tvotertab, reg1459894, textdoc, unicode, manual]\n"; + warn "Usage: $0 filetype numrows tablename hdfstargetdir [format]\n"; + warn "\tValid filetypes [studenttab, studentparttab, \n"; + warn "\t\tstudentnull, allscalars, studentcomplextab, \n"; + warn "\t\tvoternulltab votertab, unicode]\n"; + warn "hdfstargetdir is the directory in hdfs that data will be copied to for loading into tables\n"; + warn "format is one of rc, csv, or json. 
csv is the default"; } our @greekUnicode = ("\N{U+03b1}", "\N{U+03b2}", "\N{U+03b3}", "\N{U+03b4}", @@ -226,27 +283,94 @@ return $name; } +sub randomUnicodeHex() +{ + return sprintf "\\u%04x", 0x3b1 + int(rand(25)); +} + my $testvar = "\N{U+03b1}\N{U+03b3}\N{U+03b1}\N{U+03c0}\N{U+03b7}"; -sub getBulkCopyCmd(){ - my $sourceDir= shift; - my $tableName = shift; - my $delimeter = shift; - $delimeter = '\t' if ( !$delimeter ); +sub getBulkCopyCmd($$;$) +{ + my ($tableName, $delimeter, $filename) = @_; -# . "\nCOPY $tableName FROM \'$sourceDir/$tableName' using DELIMITERS \'". '\t' . "\' WITH NULL AS '\n';"; + $filename = $tableName if (!defined($filename)); + + return "load data local infile '" . cwd . "/$filename' + into table $tableName + columns terminated by '$delimeter';" +} - my $cmd= "\nbegin transaction;" - . "\nCOPY $tableName FROM \'$sourceDir/$tableName' using DELIMITERS \'$delimeter\';" - . "\ncommit;" - . "\n"; +sub generateSecondHalfCreateTable($$$;$$$) +{ + my ($hivefp, $format, $location, $fieldDelim, $structDelim, $mapDelim) = @_; - return $cmd; + if ($format eq "csv") { + print $hivefp " +row format delimited +fields terminated by '$fieldDelim' +stored as textfile +location '$location';\n"; + } elsif ($format eq "rc") { + print $hivefp " +stored as rcfile +location '$location' +TBLPROPERTIES ( + 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', + 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' +);\n"; + } elsif ($format eq "json") { + print $hivefp " STORED AS +INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' +INPUTDRIVER 'org.apache.hcatalog.pig.drivers.LoadFuncBasedInputDriver' OUTPUTDRIVER 'org.apache.hcatalog.pig.drivers.StoreFuncBasedOutputDriver'; +location '$location' +TBLPROPERTIES ('hcat.pig.loader'='org.apache.pig.builtin.JsonLoader', 'hcat.pig.storer'='org.apache.pig.builtin.JsonStorage', 'hcat.pig.loader.args'= +'s:chararray, i:int, d:double, m:map[chararray], bb:{t:(a:int, b:chararray)}, 'hcat.pig.args.delimiter'='\t') +;\n"; + } else { + die "Unknown format $format\n"; + } } +our $hadoopCoreJar = undef; -# main($) +sub findHadoopJars() { + if (not defined $ENV{'HADOOP_HOME'}) { + die 'Please set $HADOOP_HOME\n'; + } + + my $coreJar = `ls $ENV{'HADOOP_HOME'}/hadoop-core-*.jar`; + my $loggingJar = `ls $ENV{'HADOOP_HOME'}/lib/commons-logging-*.jar | grep -v api`; + my $cfgJar = `ls $ENV{'HADOOP_HOME'}/lib/commons-configuration-*.jar`; + my $langJar = `ls $ENV{'HADOOP_HOME'}/lib/commons-lang-*.jar`; + my $cliJar = `ls $ENV{'HADOOP_HOME'}/lib/commons-cli-*.jar`; + chomp $coreJar; + chomp $loggingJar; + chomp $cfgJar; + chomp $langJar; + chomp $cliJar; + return ($coreJar, $loggingJar, $cfgJar, $langJar, $cliJar); +} + +sub findHiveJars() +{ + if (not defined $ENV{'HIVE_HOME'}) { + die 'Please set $HIVE_HOME\n'; + } + + my $execJar = `ls ../../../../../hive/external/build/ql/hive-exec-*.jar`; + my $cliJar = `ls ../../../../../hive/external/build/cli/hive-cli-*.jar`; + chomp $execJar; + chomp $cliJar; + return ($execJar, $cliJar); +} + + + + +# main +{ # explicitly call srand so we get the same data every time # we generate it. However, we set it individually for each table type. 
# Otherwise we'd be generating the same data sets regardless of size, @@ -255,50 +379,121 @@ my $filetype = shift; my $numRows = shift; my $tableName = shift; - my $targetDir= shift; - my $nosql = shift; + my $hdfsTargetDir= shift; + my $format = shift; - die usage() if (!defined($filetype) || !defined($numRows)); + die usage() if (!defined($filetype) || !defined($numRows) || !defined($tableName) || !defined($hdfsTargetDir)); if ($numRows <= 0) { usage(); } - if ( $targetDir ) { - open(HDFS, "> $targetDir/$tableName") or die("Cannot open file $tableName, $!\n"); - open(PSQL, "> $targetDir/$tableName.sql") or die("Cannot open file $tableName.sql, $!\n") unless defined $nosql; - } else { - open(HDFS, "> $tableName") or die("Cannot open file $tableName, $!\n"); - open(PSQL, "> $tableName.sql") or die("Cannot open file $tableName.sql, $!\n") unless defined $nosql; + $format = "csv" if not defined $format; + + if ($format eq "csv") { + open(HDFS, "> $tableName") or die("Cannot open file $tableName, $!\n"); } + open(MYSQL, "> $tableName.mysql.sql") or + die("Cannot open file $tableName.mysql.sql, $!\n"); + open(my $hivefp, "> $tableName.hcat.sql") or + die("Cannot open file $tableName.hive.sql, $!\n"); - if ($filetype eq "manual") { - } elsif ($filetype eq "studenttab") { + if ($filetype eq "studenttab") { srand(3.14159 + $numRows); - print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n" unless defined $nosql; - print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql; - for (my $i = 0; $i < $numRows; $i++) { - my $name = randomName(); - my $age = randomAge(); - my $gpa = randomGpa(); - printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa; + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n"; + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + name string, + age int, + gpa double)"; + + generateSecondHalfCreateTable($hivefp, $format, + "$hdfsTargetDir/$tableName", '\\t'); + if ($format eq "csv") { + print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName"); + for (my $i = 0; $i < $numRows; $i++) { + my $name = randomName(); + my $age = randomAge(); + my $gpa = randomGpa(); + printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa; + } + } elsif ($format eq "rc") { + print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName.plain"); + my ($hadoopCoreJar, $commonsLoggingJar, $commonsConfigJar, + $commonsLangJar, $commonsCliJar) = findHadoopJars(); + my ($hiveExecJar, $hiveCliJar) = findHiveJars(); + my @cmd = ('java', '-cp', + "../tools/generate/java/hive-gen.jar:$hadoopCoreJar:" . + "$commonsLoggingJar:$commonsConfigJar:$commonsLangJar:" . + "$hiveExecJar", + 'org.apache.hadoop.hive.tools.generate.RCFileGenerator', + 'student', $numRows, "$tableName", "$tableName.plain"); + run(\@cmd) or die "Unable to run command [" . join(" ", @cmd) + . "]\n"; + #@cmd = ('java', '-cp', + # "$hiveCliJar:$hiveExecJar:$hadoopCoreJar:" . + # "$commonsLoggingJar:$commonsCliJar:$commonsConfigJar", + # "org.apache.hadoop.hive.cli.RCFileCat", "$tableName"); + #run(\@cmd, '>', $tableName) or + # die "Unable to run command [" . join(" ", @cmd) . 
"]\n"; + } else { + die "Unknown format $format\n"; } + } elsif ($filetype eq "studentparttab") { + srand(3.14159 + $numRows); + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (name varchar(100), age integer, gpa float(3), ds char(8));\n"; + print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName.mysql"); + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + name string, + age int, + gpa double) + partitioned by (ds string) + row format delimited + fields terminated by '\\t' + stored as textfile + location '$hdfsTargetDir/$tableName'; + alter table $tableName add IF NOT EXISTS partition (ds='20110924') location '$hdfsTargetDir/$tableName/$tableName.20110924'; + alter table $tableName add IF NOT EXISTS partition (ds='20110925') location '$hdfsTargetDir/$tableName/$tableName.20110925'; + alter table $tableName add IF NOT EXISTS partition (ds='20110926') location '$hdfsTargetDir/$tableName/$tableName.20110926'; + "; + open(MYSQLDATA, "> $tableName.mysql") or die("Cannot open file $tableName.mysql, $!\n"); + for (my $ds = 20110924; $ds < 20110927; $ds++) { + close(HDFS); + open(HDFS, "> $tableName.$ds") or die("Cannot open file $tableName.$ds, $!\n"); + for (my $i = 0; $i < $numRows; $i++) { + my $name = randomName(); + my $age = randomAge(); + my $gpa = randomGpa(); + printf HDFS "%s\t%d\t%.2f\n", $name, $age, $gpa; + printf MYSQLDATA "%s\t%d\t%.3f\t%d\n", $name, $age, $gpa, $ds; + } + } + close(MYSQLDATA); - } elsif ($filetype eq "studentnulltab") { + } elsif ($filetype eq "studentnull") { srand(3.14159 + $numRows); - print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n"; - print PSQL "begin transaction;\n"; + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n"; + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + name string, + age int, + gpa double) + row format delimited + fields terminated by '\\001' + stored as textfile + location '$hdfsTargetDir/$tableName';\n"; for (my $i = 0; $i < $numRows; $i++) { # generate nulls in a random fashion my $name = rand(1) < 0.05 ? '' : randomName(); my $age = rand(1) < 0.05 ? '' : randomAge(); my $gpa = rand(1) < 0.05 ? '' : randomGpa(); - printf PSQL "insert into $tableName (name, age, gpa) values("; - print PSQL ($name eq ''? "null, " : "'$name', "), ($age eq ''? "null, " : "$age, "); + printf MYSQL "insert into $tableName (name, age, gpa) values("; + print MYSQL ($name eq ''? "null, " : "'$name', "), ($age eq ''? 
"null, " : "$age, "); if($gpa eq '') { - print PSQL "null);\n" + print MYSQL "null);\n" } else { - printf PSQL "%.2f);\n", $gpa; + printf MYSQL "%.2f);\n", $gpa; } - print HDFS "$name\t$age\t"; + print HDFS "$name$age"; if($gpa eq '') { print HDFS "\n" } else { @@ -306,69 +501,65 @@ } } - print PSQL "commit;\n" unless defined $nosql; + print MYSQL "commit;\n"; - } elsif ($filetype eq "studentcolon") { + } elsif ($filetype eq "allscalars") { srand(2.718281828459 + $numRows); - print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n" unless defined $nosql; - print PSQL &getBulkCopyCmd( $targetDir, $tableName, ':' ) unless defined $nosql; + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (t tinyint, si smallint, i int, b + bigint, f double, d double, s varchar(25));\n"; + print MYSQL &getBulkCopyCmd($tableName, ':'); + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + s string) + row format delimited + fields terminated by ':' + stored as textfile + location '$hdfsTargetDir/$tableName';\n + alter table $tableName set TBLPROPERTIES + ('hcat.pig.loader.args'=':', 'hcat.pig.storer.args'=':');\n"; for (my $i = 0; $i < $numRows; $i++) { - my $name = randomName(); - my $age = randomAge(); - my $gpa = randomGpa(); - printf HDFS "%s:%d:%.2f\n", $name, $age, $gpa; -=begin - } elsif ($filetype eq "studentusrdef") { - srand(6.62606896 + $numRows); - for (my $i = 0; $i < $numRows; $i++) { - # TODO need to add SQL info. - printf("%s,%d,%.2f,", randomName(), randomAge(), randomGpa()); - printf("<%s,%s,%s,%d>,", randomStreet(), randomCity(), randomState(), - randomZip()); - printf("[%s:<%s,%s>],", randomClass(), randomClass(), randomName()); - printf("{"); - my $elementsInBag = int(rand(100)); - for (my $j = 0; $j < $elementsInBag; $j++) { - if ($j != 0) { printf(","); } - printf("<%s,%s,%s>", randomClass(), randomName(), randomGrade()); - } - printf("}\n"); + printf HDFS "%d:%d:%d:%ld:%.2f:%.2f:%s\n", + (int(rand(2**8) - 2**7)), + (int(rand(2**16) - 2**15)), + (int(rand(2**32) - 2**31)), + (int(rand(2**64) - 2**61)), + rand(100000.0) - 50000.0, + rand(10000000.0) - 5000000.0, + randomName(); } -=cut - } - print PSQL "commit;\n" unless defined $nosql; - - } elsif ($filetype eq "studentctrla") { - srand(6.14159 + $numRows); - print PSQL "create table $tableName (name varchar(100), age integer, gpa float(3));\n"; - print PSQL "begin transaction;\n"; - for (my $i = 0; $i < $numRows; $i++) { - my $name = randomName(); - my $age = randomAge(); - my $gpa = randomGpa(); - printf PSQL "insert into $tableName (name, age, gpa) values('%s', %d, %.2f);\n", - $name, $age, $gpa; - printf HDFS "%s%d%.2f\n", $name, $age, $gpa; - } - print PSQL "commit;\n" unless defined $nosql; - - } elsif ($filetype eq "studentcomplextab") { srand(3.14159 + $numRows); - print PSQL "create table $tableName (nameagegpamap varchar(500), nameagegpatuple varchar(500), nameagegpabag varchar(500), nameagegpamap_name varchar(500), nameagegpamap_age integer, nameagegpamap_gpa float(3));\n"; - print PSQL "begin transaction;\n"; + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (nameagegpamap varchar(500), nameagegpatuple varchar(500), nameagegpabag varchar(500), nameagegpamap_name varchar(500), nameagegpamap_age integer, nameagegpamap_gpa float(3));\n"; + print MYSQL "begin transaction;\n"; + print $hivefp "drop table if exists 
$tableName;\ncreate external table $tableName( + nameagegpamap map, + struct , + array ) + row format delimited + fields terminated by '\\t' + collection items terminated by ',' + map keys terminated by '#' + stored as textfile + location '$hdfsTargetDir/$tableName';\n"; for (my $i = 0; $i < $numRows; $i++) { # generate nulls in a random fashion my $map = rand(1) < 0.05 ? '' : randomNameAgeGpaMap(); my $tuple = rand(1) < 0.05 ? '' : randomNameAgeGpaTuple(); - my $bag = rand(1) < 0.05 ? '' : randomNameAgeGpaBag(); - printf PSQL "insert into $tableName (nameagegpamap, nameagegpatuple, nameagegpabag, nameagegpamap_name, nameagegpamap_age, nameagegpamap_gpa) values("; + my $bag = rand(1) < 0.05 ? '' : randomList(); + printf MYSQL "insert into $tableName (nameagegpamap, nameagegpatuple, nameagegpabag, nameagegpamap_name, nameagegpamap_age, nameagegpamap_gpa) values("; my $mapHash; if($map ne '') { $mapHash = getMapFields($map); } - print PSQL ($map eq ''? "null, " : "'$map', "), + print MYSQL ($map eq ''? "null, " : "'$map', "), ($tuple eq ''? "null, " : "'$tuple', "), ($bag eq '' ? "null, " : "'$bag', "), ($map eq '' ? "null, " : (exists($mapHash->{'name'}) ? "'".$mapHash->{'name'}."', " : "null, ")), @@ -376,13 +567,23 @@ ($map eq '' ? "null);\n" : (exists($mapHash->{'gpa'}) ? "'".$mapHash->{'gpa'}."');\n" : "null);\n")); print HDFS "$map\t$tuple\t$bag\n"; } - print PSQL "commit;\n" unless defined $nosql; + print MYSQL "commit;\n"; } elsif ($filetype eq "votertab") { srand(299792458 + $numRows); - print PSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n" unless defined $nosql; - print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql; - for (my $i = 0; $i < $numRows; $i++) { + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n"; + print MYSQL &getBulkCopyCmd($tableName, "\t"); + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + name string, + age int, + registration string, + contributions float) + row format delimited + fields terminated by '\\t' + stored as textfile + location '$hdfsTargetDir/$tableName';\n"; +for (my $i = 0; $i < $numRows; $i++) { my $name = randomName(); my $age = randomAge(); my $registration = randomRegistration(); @@ -393,22 +594,32 @@ } elsif ($filetype eq "voternulltab") { srand(299792458 + $numRows); - print PSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n" unless defined $nosql; - print PSQL "begin transaction;\n" unless defined $nosql; + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (name varchar(100), age integer, registration varchar(20), contributions float);\n"; + print MYSQL "begin transaction;\n"; + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + name string, + age int, + registration string, + contributions float) + row format delimited + fields terminated by '\\t' + stored as textfile + location '$hdfsTargetDir/$tableName';\n"; for (my $i = 0; $i < $numRows; $i++) { # generate nulls in a random fashion my $name = rand(1) < 0.05 ? '' : randomName(); my $age = rand(1) < 0.05 ? '' : randomAge(); my $registration = rand(1) < 0.05 ? '' : randomRegistration(); my $contribution = rand(1) < 0.05 ? 
'' : randomContribution(); - printf PSQL "insert into $tableName (name, age, registration, contributions) values("; - print PSQL ($name eq ''? "null, " : "'$name', "), + printf MYSQL "insert into $tableName (name, age, registration, contributions) values("; + print MYSQL ($name eq ''? "null, " : "'$name', "), ($age eq ''? "null, " : "$age, "), ($registration eq ''? "null, " : "'$registration', "); if($contribution eq '') { - print PSQL "null);\n" + print MYSQL "null);\n" } else { - printf PSQL "%.2f);\n", $contribution; + printf MYSQL "%.2f);\n", $contribution; } print HDFS "$name\t$age\t$registration\t"; if($contribution eq '') { @@ -417,43 +628,65 @@ printf HDFS "%.2f\n", $contribution; } } - print PSQL "commit;\n" unless defined $nosql; + print MYSQL "commit;\n"; - } elsif ($filetype eq "reg1459894") { - srand(6.67428 + $numRows); - print PSQL "create table $tableName (first varchar(10), second varchar(10));\n" unless defined $nosql; - print PSQL &getBulkCopyCmd( $targetDir, $tableName ) unless defined $nosql; - for (my $i = 0; $i < $numRows; $i++) { - my $letter = randomNumLetter(); - my $gkLetter = randomGreekLetter(); - printf HDFS "%s\t%s\n", $letter, $gkLetter; - } - - } elsif ($filetype eq "textdoc") { - # This one ignores the number of lines. It isn't random either. - print PSQL "create table $tableName (name varchar(255));\n" unless defined $nosql; - print PSQL "begin transaction;\n" unless defined $nosql; - for (my $i = 0; $i < @textDoc; $i++) { - my $sqlWords = $textDoc[$i]; - $sqlWords =~ s/([\w-]+)/$1,/g; - print PSQL "insert into $tableName (name) values('($sqlWords)');\n" unless defined $nosql; - print HDFS "$textDoc[$i]\n"; - } - print PSQL "commit;\n" unless defined $nosql; - - } elsif ($filetype eq "unicode") { srand(1.41421 + $numRows); - print PSQL "create table $tableName (name varchar(255));\n" unless defined $nosql; - print PSQL "begin transaction;\n" unless defined $nosql; + print MYSQL "drop table if exists $tableName;\n"; + print MYSQL "create table $tableName (name varchar(255));\n"; + print MYSQL "begin transaction;\n"; + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + name string) + row format delimited + fields terminated by '\\t' + stored as textfile + location '$hdfsTargetDir/$tableName';\n"; for (my $i = 0; $i < $numRows; $i++) { my $name = randomUnicodeNonAscii(); - printf PSQL "insert into $tableName (name) values('%s');\n", - $name unless defined $nosql; + printf MYSQL "insert into $tableName (name) values('%s');\n", $name; printf HDFS "%s\n", $name; } - print PSQL "commit;\n" unless defined $nosql; + print MYSQL "commit;\n"; + } elsif ($filetype eq "json") { + srand(6.0221415 + $numRows); + print MYSQL "drop table if exists $tableName;"; + print MYSQL "create table $tableName( + s varchar(100), + i int, + d double);"; + print MYSQL &getBulkCopyCmd($tableName, "\t", "$tableName.plain"); + print $hivefp "drop table if exists $tableName;\ncreate external table $tableName( + s string, + i int, + d double, + m map, + bb array>) + STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' + INPUTDRIVER 'org.apache.hcatalog.pig.drivers.LoadFuncBasedInputDriver' OUTPUTDRIVER 'org.apache.hcatalog.pig.drivers.StoreFuncBasedOutputDriver' + location '$hdfsTargetDir/$tableName' + TBLPROPERTIES ('hcat.pig.loader'='org.apache.pig.builtin.JsonLoader', 'hcat.pig.storer'='org.apache.pig.builtin.JsonStorage', 'hcat.pig.loader.args'= +'s:chararray, i:int, 
d:double, m:map[chararray], bb:{t:(a:int, b:chararray)}', 'hcat.pig.args.delimiter'='\t');\n"; + open(PLAIN, ">$tableName.plain") or + die("Cannot open file $tableName.hive.sql, $!\n"); + for (my $i = 0; $i < $numRows; $i++) { + my $s = randomJsonString(); + my $i = int(rand(2**32) - 2**31), + my $d = rand(2**10) - 2**9, +# my $i = rand(1) < 0.05 ? 'null' : (int(rand(2**32) - 2**31)), +# my $d = rand(1) < 0.05 ? 'null' : (rand(2**10) - 2**9), + my $m = randomJsonMap(); + my $bb = randomJsonBag(); +# printf MYSQL "insert into $tableName (name) values('%s');\n", $name; + print HDFS qq@{"s":"$s", "i":$i, "d":$d, "m":$m, "bb":$bb}\n@; + if ($s eq 'null') { + $s=""; + } + print PLAIN "$s\t$i\t$d\n"; + } + close PLAIN; + print MYSQL "commit;\n"; + } else { warn "Unknown filetype $filetype\n"; usage(); Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteText.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteText.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteText.java (revision 0) @@ -0,0 +1,130 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.HCatOutputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.hcatalog.mapreduce.OutputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. 
This is to simulate a typical operation in a map reduce + * program to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat + * jar> The argument controls the output delimiter The hcat jar + * location should be specified as file:// + */ +public class WriteText extends Configured implements Tool { + + public static class Map extends + Mapper { + + int t; + int si; + int i; + long b; + float f; + double d; + String s; + + @Override + protected void map( + WritableComparable key, + HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException, InterruptedException { + t = (Integer)value.get(0); + si = (Integer)value.get(1); + i = (Integer)value.get(2); + b = (Long)value.get(3); + f = (Float)value.get(4); + d = (Double)value.get(5); + s = (String)value.get(6); + + HCatRecord record = new DefaultHCatRecord(7); + record.set(0, t); + record.set(1, si); + record.set(2, i); + record.set(3, b); + record.set(4, f); + record.set(5, d); + record.set(6, s); + + context.write(null, record); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String inputTableName = args[1]; + String outputTableName = args[2]; + String dbName = null; + + String principalID = System + .getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if (principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "WriteText"); + HCatInputFormat.setInput(job, InputJobInfo.create(dbName, + inputTableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setJarByClass(WriteText.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(WritableComparable.class); + job.setOutputValueClass(DefaultHCatRecord.class); + job.setNumReduceTasks(0); + HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, + outputTableName, null, serverUri, principalID)); + HCatSchema s = HCatInputFormat.getTableSchema(job); + System.err.println("INFO: output schema explicitly set for writing:" + + s); + HCatOutputFormat.setSchema(job, s); + job.setOutputFormatClass(HCatOutputFormat.class); + return (job.waitForCompletion(true) ? 0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new WriteText(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadRC.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadRC.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadRC.java (revision 0) @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.pig.data.DataBag; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. This is to simulate a typical operation in a map reduce program + * to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat jar> + The argument controls the output delimiter + The hcat jar location should be specified as file:// + */ +public class ReadRC extends Configured implements Tool { + + public static class Map + extends Mapper{ + + String name; + int age; + double gpa; + + @Override + protected void map(WritableComparable key, HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException ,InterruptedException { + name = (String)value.get(0); + age = (Integer)value.get(1); + gpa = (Double)value.get(2); + + HCatRecord record = new DefaultHCatRecord(3); + record.set(0, name); + record.set(1, age); + record.set(2, gpa); + + context.write(null, record); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String tableName = args[1]; + String outputDir = args[2]; + String dbName = null; + + String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if(principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "ReadRC"); + HCatInputFormat.setInput(job, InputJobInfo.create( + dbName, tableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setJarByClass(ReadRC.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(IntWritable.class); + job.setOutputValueClass(HCatRecord.class); + job.setNumReduceTasks(0); + FileOutputFormat.setOutputPath(job, new Path(outputDir)); + return (job.waitForCompletion(true) ? 
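A hypothetical invocation of the two MapReduce test drivers above, assuming they are packed into the e2e testudf.jar; the jar name, thrift URI and table names here are assumptions, not values from this patch:

    hadoop jar testudf.jar org.apache.hcatalog.utils.ReadRC \
        thrift://hcat-server.example.com:9933 studenttab_100 /user/hcat/out/readrc
    hadoop jar testudf.jar org.apache.hcatalog.utils.WriteText \
        thrift://hcat-server.example.com:9933 all_scalars all_scalars_out

Both programs take the metastore URI as args[0]; ReadRC then expects a table name and an HDFS output directory, while WriteText expects an input table and an output table, matching the argument handling in their run() methods.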
0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new ReadRC(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/SumNumbers.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/SumNumbers.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/SumNumbers.java (working copy) @@ -1,257 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.DoubleWritable; -import org.apache.hadoop.io.FloatWritable; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.hcatalog.common.HCatConstants; -import org.apache.hcatalog.data.HCatRecord; -import org.apache.hcatalog.mapreduce.HCatInputFormat; -import org.apache.hcatalog.mapreduce.InputJobInfo; - -/** - * This is a map reduce test for testing hcat which goes against the "numbers" - * table. It performs a group by on the first column and a SUM operation on the - * other columns. 
This is to simulate a typical operation in a map reduce program - * to test that hcat hands the right data to the map reduce program - * - * Usage: hadoop jar sumnumbers <-libjars hive-hcat jar> - The argument controls the output delimiter - The hcat jar location should be specified as file:// - */ -public class SumNumbers { - - private static final String NUMBERS_TABLE_NAME = "numbers"; - private static final String TAB = "\t"; - - public static class SumMapper - extends Mapper{ - - IntWritable intnum1000; - // though id is given as a Short by hcat, the map will emit it as an - // IntWritable so we can just sum in the reduce - IntWritable id; - - // though intnum5 is handed as a Byte by hcat, the map() will emit it as - // an IntWritable so we can just sum in the reduce - IntWritable intnum5; - IntWritable intnum100; - IntWritable intnum; - LongWritable longnum; - FloatWritable floatnum; - DoubleWritable doublenum; - @Override - protected void map(WritableComparable key, HCatRecord value, - org.apache.hadoop.mapreduce.Mapper.Context context) - throws IOException ,InterruptedException { - intnum1000 = new IntWritable((Integer)value.get(0)); - id = new IntWritable((Short) value.get(1)); - intnum5 = new IntWritable(((Byte)value.get(2))); - intnum100 = new IntWritable(((Integer) value.get(3))); - intnum = new IntWritable((Integer) value.get(4)); - longnum = new LongWritable((Long) value.get(5)); - floatnum = new FloatWritable((Float) value.get(6)); - doublenum = new DoubleWritable((Double) value.get(7)); - SumNumbers.ArrayWritable outputValue = new SumNumbers.ArrayWritable(id, - intnum5, intnum100, intnum, longnum, floatnum, doublenum); - context.write(intnum1000, outputValue); - - } - } - - public static class SumReducer extends Reducer { - - - LongWritable dummyLong = null; - @Override - protected void reduce(IntWritable key, java.lang.Iterable - values, org.apache.hadoop.mapreduce.Reducer.Context context) - throws IOException ,InterruptedException { - String output = key.toString() + TAB; - Long sumid = 0l; - Long sumintnum5 = 0l; - Long sumintnum100 = 0l; - Long sumintnum = 0l; - Long sumlongnum = 0l; - Float sumfloatnum = 0.0f; - Double sumdoublenum = 0.0; - for (ArrayWritable value : values) { - sumid += value.id.get(); - sumintnum5 += value.intnum5.get(); - sumintnum100 += value.intnum100.get(); - sumintnum += value.intnum.get(); - sumlongnum += value.longnum.get(); - sumfloatnum += value.floatnum.get(); - sumdoublenum += value.doublenum.get(); - } - output += sumid + TAB; - output += sumintnum5 + TAB; - output += sumintnum100 + TAB; - output += sumintnum + TAB; - output += sumlongnum + TAB; - output += sumfloatnum + TAB; - output += sumdoublenum + TAB; - context.write(dummyLong, new Text(output)); - } - } - - public static void main(String[] args) throws Exception { - Configuration conf = new Configuration(); - args = new GenericOptionsParser(conf, args).getRemainingArgs(); - String[] otherArgs = new String[4]; - int j = 0; - for(int i = 0; i < args.length; i++) { - if(args[i].equals("-libjars")) { - // generic options parser doesn't seem to work! 
- conf.set("tmpjars", args[i+1]); - i = i+1; // skip it , the for loop will skip its value - } else { - otherArgs[j++] = args[i]; - } - } - if (otherArgs.length != 4) { - System.err.println("Usage: hadoop jar sumnumbers <-libjars hive-hcat jar>\n" + - "The argument controls the output delimiter.\n" + - "The hcat jar location should be specified as file://\n"); - System.exit(2); - } - String serverUri = otherArgs[0]; - String tableName = NUMBERS_TABLE_NAME; - String outputDir = otherArgs[1]; - String dbName = "default"; - - String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); - if(principalID != null) - conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); - Job job = new Job(conf, "sumnumbers"); - HCatInputFormat.setInput(job, InputJobInfo.create( - dbName, tableName, null, serverUri, principalID)); - // initialize HCatOutputFormat - - job.setInputFormatClass(HCatInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - job.setJarByClass(SumNumbers.class); - job.setMapperClass(SumMapper.class); - job.setReducerClass(SumReducer.class); - job.setMapOutputKeyClass(IntWritable.class); - job.setMapOutputValueClass(ArrayWritable.class); - job.setOutputKeyClass(LongWritable.class); - job.setOutputValueClass(Text.class); - FileOutputFormat.setOutputPath(job, new Path(outputDir)); - System.exit(job.waitForCompletion(true) ? 0 : 1); - } - - public static class ArrayWritable implements Writable { - - // though id is given as a Short by hcat, the map will emit it as an - // IntWritable so we can just sum in the reduce - IntWritable id; - - // though intnum5 is handed as a Byte by hcat, the map() will emit it as - // an IntWritable so we can just sum in the reduce - IntWritable intnum5; - - IntWritable intnum100; - IntWritable intnum; - LongWritable longnum; - FloatWritable floatnum; - DoubleWritable doublenum; - - /** - * - */ - public ArrayWritable() { - id = new IntWritable(); - intnum5 = new IntWritable(); - intnum100 = new IntWritable(); - intnum = new IntWritable(); - longnum = new LongWritable(); - floatnum = new FloatWritable(); - doublenum = new DoubleWritable(); - } - - - - /** - * @param id - * @param intnum5 - * @param intnum100 - * @param intnum - * @param longnum - * @param floatnum - * @param doublenum - */ - public ArrayWritable(IntWritable id, IntWritable intnum5, - IntWritable intnum100, IntWritable intnum, LongWritable longnum, - FloatWritable floatnum, DoubleWritable doublenum) { - this.id = id; - this.intnum5 = intnum5; - this.intnum100 = intnum100; - this.intnum = intnum; - this.longnum = longnum; - this.floatnum = floatnum; - this.doublenum = doublenum; - } - - - - @Override - public void readFields(DataInput in) throws IOException { - id.readFields(in); - intnum5.readFields(in); - intnum100.readFields(in); - intnum.readFields(in); - longnum.readFields(in); - floatnum.readFields(in); - doublenum.readFields(in); - } - - @Override - public void write(DataOutput out) throws IOException { - id.write(out); - intnum5.write(out); - intnum100.write(out); - intnum.write(out); - longnum.write(out); - floatnum.write(out); - doublenum.write(out); - - } - - } -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/PartitionStorageDriverAnnotator.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/PartitionStorageDriverAnnotator.java (revision 1208047) +++ 
src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/PartitionStorageDriverAnnotator.java (working copy) @@ -1,114 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; -import org.apache.hadoop.hive.metastore.api.InvalidOperationException; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.NoSuchObjectException; -import org.apache.hadoop.hive.metastore.api.Partition; -import org.apache.hcatalog.rcfile.RCFileInputDriver; -import org.apache.hcatalog.rcfile.RCFileOutputDriver; -import org.apache.thrift.TException; - -/** - * A utility program to annotate partitions of a pre-created table - * with input storage driver and output storage driver information - */ -public class PartitionStorageDriverAnnotator { - - /** - * @param args - * @throws MetaException - * @throws TException - * @throws NoSuchObjectException - * @throws InvalidOperationException - */ - public static void main(String[] args) throws MetaException, NoSuchObjectException, - TException, InvalidOperationException { - String thrifturi = null; - String database = "default"; - String table = null; - String isd = null; - String osd = null; - Map m = new HashMap(); - for(int i = 0; i < args.length; i++) { - if(args[i].equals("-u")) { - thrifturi = args[i+1]; - } else if(args[i].equals("-t")) { - table = args[i+1]; - } else if (args[i].equals("-i")) { - isd = args[i+1]; - } else if (args[i].equals("-o")) { - osd = args[i+1]; - } else if (args[i].equals("-p")) { - String[] kvps = args[i+1].split(";"); - for(String kvp: kvps) { - String[] kv = kvp.split("="); - if(kv.length != 2) { - System.err.println("ERROR: key value property pairs must be specified as key1=val1;key2=val2;..;keyn=valn"); - System.exit(1); - } - m.put(kv[0], kv[1]); - } - } else if(args[i].equals("-d")) { - database = args[i+1]; - } else { - System.err.println("ERROR: Unknown option: " + args[i]); - usage(); - } - i++; // to skip the value for an option - } - if(table == null || thrifturi == null) { - System.err.println("ERROR: thrift uri and table name are mandatory"); - usage(); - } - HiveConf hiveConf = new HiveConf(PartitionStorageDriverAnnotator.class); - hiveConf.set("hive.metastore.local", "false"); - hiveConf.set("hive.metastore.uris", thrifturi); - - HiveMetaStoreClient hmsc = new HiveMetaStoreClient(hiveConf,null); - List parts = hmsc.listPartitions(database, table, Short.MAX_VALUE); - - m.put("hcat.isd", isd != null ? isd : RCFileInputDriver.class.getName()); - m.put("hcat.osd", osd != null ? 
osd : RCFileOutputDriver.class.getName()); - - for(Partition p: parts) { - p.setParameters(m); - hmsc.alter_partition(database, table, p); - } - } - - /** - * - */ - private static void usage() { - System.err.println("Usage: java -cp testudf.jar: org.apache.hcat.utils.PartitionStorageDriverAnnotator -u -t " + - " [-i input driver classname (Default rcfiledriver)] [-o output driver classname (default rcfiledriver)] " + - " [-p key1=val1;key2=val2;..;keyn=valn (list of key=value property pairs to associate with each partition)]" + - " [-d database (if this not supplied the default database is used)]"); - System.exit(1); - } - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreDemo.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreDemo.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreDemo.java (working copy) @@ -1,152 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Random; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.hcatalog.common.HCatConstants; -import org.apache.hcatalog.data.DefaultHCatRecord; -import org.apache.hcatalog.data.HCatRecord; -import org.apache.hcatalog.data.schema.HCatSchema; -import org.apache.hcatalog.mapreduce.HCatInputFormat; -import org.apache.hcatalog.mapreduce.HCatOutputFormat; -import org.apache.hcatalog.mapreduce.InputJobInfo; -import org.apache.hcatalog.mapreduce.OutputJobInfo; - -/** - * This is a map reduce test for testing hcat which goes against the "numbers" - * table and writes data to another table. It reads data from numbers which - * is an unpartitioned table and adds 10 to each field. It stores the result into - * the datestamp='20100101' partition of the numbers_part_empty_initially table if the second - * command line arg is "part". If the second cmdline arg is "nopart" then the - * result is stored into the 'numbers_nopart_empty_initially' (unpartitioned) table. 
- * If the second cmdline arg is "nopart_pig", then the result is stored into the - * 'numbers_nopart_pig_empty_initially' (unpartitioned) table with the tinyint - * and smallint columns in "numbers" being stored as "int" (since pig cannot handle - * tinyint and smallint) - * - * Usage: hadoop jar storenumbers <-libjars hive-hcat jar> - If the second argument is "part" data is written to datestamp = '2010101' partition of the numbers_part_empty_initially table. - If the second argument is "nopart", data is written to the unpartitioned numbers_nopart_empty_initially table. - If the second argument is "nopart_pig", data is written to the unpartitioned numbers_nopart_pig_empty_initially table. - The hcat jar location should be specified as file:// - */ -public class StoreDemo { - - private static final String NUMBERS_PARTITIONED_TABLE_NAME = "demo_partitioned"; - private static final String NUMBERS_TABLE_NAME = "demo"; - - public static class SumMapper - extends Mapper{ - - - Integer intnum; - - Double doublenum; - @Override - protected void map(WritableComparable key, HCatRecord value, - org.apache.hadoop.mapreduce.Mapper.Context context) - throws IOException ,InterruptedException { - intnum = ((Integer)value.get(0)); - value.set(0, intnum + 20); - doublenum = ((Double) value.get(1)); - value.set(1, (Double) (doublenum + 20)); - context.write(new IntWritable(0), value); - - } - } - - - public static void main(String[] args) throws Exception { - Configuration conf = new Configuration(); - args = new GenericOptionsParser(conf, args).getRemainingArgs(); - String[] otherArgs = new String[1]; - int j = 0; - for(int i = 0; i < args.length; i++) { - if(args[i].equals("-libjars")) { - // generic options parser doesn't seem to work! - conf.set("tmpjars", args[i+1]); - i = i+1; // skip it , the for loop will skip its value - } else { - otherArgs[j++] = args[i]; - } - } - if (otherArgs.length != 1) { - usage(); - } - String serverUri = otherArgs[0]; - - String tableName = NUMBERS_TABLE_NAME; - String dbName = "default"; - Map outputPartitionKvps = new HashMap(); - String outputTableName = NUMBERS_PARTITIONED_TABLE_NAME; - outputPartitionKvps.put("datestamp", "20100102"); - - String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); - if(principalID != null) - conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); - Job job = new Job(conf, "storedemo"); - // initialize HCatInputFormat - HCatInputFormat.setInput(job, InputJobInfo.create( - dbName, tableName, null, serverUri, principalID)); - // initialize HCatOutputFormat - HCatOutputFormat.setOutput(job, OutputJobInfo.create( - dbName, outputTableName, outputPartitionKvps, serverUri, principalID)); - // test with and without specifying schema randomly - HCatSchema s = HCatInputFormat.getTableSchema(job); - System.err.println("INFO: output schema explicitly set for writing:" + s); - HCatOutputFormat.setSchema(job, s); - - job.setInputFormatClass(HCatInputFormat.class); - job.setOutputFormatClass(HCatOutputFormat.class); - job.setJarByClass(StoreDemo.class); - job.setMapperClass(SumMapper.class); - job.setOutputKeyClass(IntWritable.class); - job.setNumReduceTasks(0); - job.setOutputValueClass(DefaultHCatRecord.class); - System.exit(job.waitForCompletion(true) ? 
0 : 1); - } - - - /** - * - */ - private static void usage() { - System.err.println("Usage: hadoop jar storenumbers <-libjars hive-hcat jar>\n" + - "\tIf the second argument is \"part\" data is written to datestamp = '2010101' partition of " + - "the numbers_part_empty_initially table.\n\tIf the second argument is \"nopart\", data is written to " + - "the unpartitioned numbers_nopart_empty_initially table.\n\tIf the second argument is \"nopart_pig\", " + - "data is written to the unpartitioned numbers_nopart_pig_empty_initially table.\nt" + - "The hcat jar location should be specified as file://\n"); - System.exit(2); - - } - - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/GroupByAge.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/GroupByAge.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/GroupByAge.java (revision 0) @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.HCatOutputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.hcatalog.mapreduce.OutputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. 
This is to simulate a typical operation in a map reduce + * program to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat + * jar> The argument controls the output delimiter The hcat jar + * location should be specified as file:// + */ +public class GroupByAge extends Configured implements Tool { + + public static class Map extends + Mapper { + + int age; + + @Override + protected void map( + WritableComparable key, + HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException, InterruptedException { + age = (Integer) value.get(1); + context.write(new IntWritable(age), new IntWritable(1)); + } + } + + public static class Reduce extends Reducer { + + + @Override + protected void reduce(IntWritable key, java.lang.Iterable + values, org.apache.hadoop.mapreduce.Reducer.Context context) + throws IOException ,InterruptedException { + int sum = 0; + Iterator iter = values.iterator(); + while (iter.hasNext()) { + sum++; + iter.next(); + } + HCatRecord record = new DefaultHCatRecord(2); + record.set(0, key.get()); + record.set(1, sum); + + context.write(null, record); + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String inputTableName = args[1]; + String outputTableName = args[2]; + String dbName = null; + + String principalID = System + .getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if (principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "GroupByAge"); + HCatInputFormat.setInput(job, InputJobInfo.create(dbName, + inputTableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setJarByClass(GroupByAge.class); + job.setMapperClass(Map.class); + job.setReducerClass(Reduce.class); + job.setMapOutputKeyClass(IntWritable.class); + job.setMapOutputValueClass(IntWritable.class); + job.setOutputKeyClass(WritableComparable.class); + job.setOutputValueClass(DefaultHCatRecord.class); + HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, + outputTableName, null, serverUri, principalID)); + HCatSchema s = HCatOutputFormat.getTableSchema(job); + System.err.println("INFO: output schema explicitly set for writing:" + + s); + HCatOutputFormat.setSchema(job, s); + job.setOutputFormatClass(HCatOutputFormat.class); + return (job.waitForCompletion(true) ? 0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new GroupByAge(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadJson.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadJson.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadJson.java (revision 0) @@ -0,0 +1,115 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.pig.data.DataBag; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. This is to simulate a typical operation in a map reduce program + * to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat jar> + The argument controls the output delimiter + The hcat jar location should be specified as file:// + */ +public class ReadJson extends Configured implements Tool { + + public static class Map + extends Mapper{ + + String s; + Integer i; + Double d; + + @Override + protected void map(WritableComparable key, HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException ,InterruptedException { + s = value.get(0)==null?null:(String)value.get(0); + i = value.get(1)==null?null:(Integer)value.get(1); + d = value.get(2)==null?null:(Double)value.get(2); + + HCatRecord record = new DefaultHCatRecord(3); + record.set(0, s); + record.set(1, i); + record.set(2, d); + + context.write(null, record); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String tableName = args[1]; + String outputDir = args[2]; + String dbName = null; + + String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if(principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "ReadJson"); + HCatInputFormat.setInput(job, InputJobInfo.create( + dbName, tableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setJarByClass(ReadJson.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(IntWritable.class); + job.setOutputValueClass(HCatRecord.class); + job.setNumReduceTasks(0); + FileOutputFormat.setOutputPath(job, new Path(outputDir)); + 
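+        // Map-only job: with zero reduce tasks every HCatRecord emitted by Map.map()
+        // goes straight to TextOutputFormat, which writes the record's toString() form
+        // under outputDir (the null key is skipped); the boolean returned by
+        // waitForCompletion below becomes this tool's exit status.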
return (job.waitForCompletion(true) ? 0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new ReadJson(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/TypeDataCheck.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/TypeDataCheck.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/TypeDataCheck.java (working copy) @@ -1,182 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; -import org.apache.hcatalog.utils.Util; -import org.apache.hcatalog.common.HCatConstants; -import org.apache.hcatalog.data.HCatRecord; -import org.apache.hcatalog.data.schema.HCatSchema; -import org.apache.hcatalog.mapreduce.HCatInputFormat; -import org.apache.hcatalog.mapreduce.InputJobInfo; - -/** - * This is a map reduce test for testing hcat that checks that the columns - * handed by hcat have the right type and right values. It achieves the first - * objective by checking the type of the Objects representing the columns against - * the schema provided as a cmdline arg. It achieves the second objective by - * writing the data as Text to be compared against golden results. - * - * The schema specification consists of the types as given by "describe " - * with each column's type separated from the next column's type by a '+' - * - * Can be used against "numbers" and "complex" tables. - * - * Usage: hadoop jar testudf.jar typedatacheck - * <-libjars hive-hcat jar> - The argument controls the output delimiter. 
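 - *   e.g. for the "numbers" table exercised elsewhere in these tests the schema argument
 - *   would be int+smallint+tinyint+int+int+bigint+float+double (an illustrative example;
 - *   the real argument is whatever "describe <table>" reports, joined with '+')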
- The hcat jar location should be specified as file:// - */ -public class TypeDataCheck implements Tool{ - - static String SCHEMA_KEY = "schema"; - static String DELIM = "delim"; - private static Configuration conf = new Configuration(); - - public static class TypeDataCheckMapper - extends Mapper{ - - Long dummykey = null; - String[] types; - String delim = "\u0001"; - @Override - protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context) - throws IOException ,InterruptedException { - String typesStr = context.getConfiguration().get(SCHEMA_KEY); - delim = context.getConfiguration().get(DELIM); - if(delim.equals("tab")) { - delim = "\t"; - } else if (delim.equals("ctrla")) { - delim = "\u0001"; - } - types = typesStr.split("\\+"); - for(int i = 0; i < types.length; i++) { - types[i] = types[i].toLowerCase(); - } - - - } - - String check(HCatRecord r) throws IOException { - String s = ""; - for(int i = 0; i < r.size(); i++) { - s += Util.check(types[i], r.get(i)); - if(i != r.size() - 1) { - s += delim; - } - } - return s; - } - - @Override - protected void map(WritableComparable key, HCatRecord value, - org.apache.hadoop.mapreduce.Mapper.Context context) - throws IOException ,InterruptedException { - context.write(dummykey, new Text(check(value))); - } - } - - public static void main(String[] args) throws Exception { - TypeDataCheck self = new TypeDataCheck(); - System.exit(ToolRunner.run(conf, self, args)); - } - - public int run(String[] args) { - try { - args = new GenericOptionsParser(conf, args).getRemainingArgs(); - String[] otherArgs = new String[5]; - int j = 0; - for(int i = 0; i < args.length; i++) { - if(args[i].equals("-libjars")) { - conf.set("tmpjars",args[i+1]); - i = i+1; // skip it , the for loop will skip its value - } else { - otherArgs[j++] = args[i]; - } - } - if (otherArgs.length !=5 ) { - System.err.println("Other args:" + Arrays.asList(otherArgs)); - System.err.println("Usage: hadoop jar testudf.jar typedatacheck " + - " " + - " <-libjars hive-hcat jar>\n" + - "The argument controls the output delimiter.\n" + - "The hcat jar location should be specified as file://\n"); - System.err.println(" The argument controls the output delimiter."); - System.exit(2); - } - String serverUri = otherArgs[0]; - String tableName = otherArgs[1]; - String schemaStr = otherArgs[2]; - String outputDir = otherArgs[3]; - String outputdelim = otherArgs[4]; - if(!outputdelim.equals("tab") && !outputdelim.equals("ctrla")) { - System.err.println("ERROR: Specify 'tab' or 'ctrla' for output delimiter"); - } - String dbName = "default"; - - String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); - if(principalID != null){ - conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); - } - Job job = new Job(conf, "typedatacheck"); - // initialize HCatInputFormat - HCatInputFormat.setInput(job, InputJobInfo.create( - dbName, tableName, null, serverUri, principalID)); - HCatSchema s = HCatInputFormat.getTableSchema(job); - job.getConfiguration().set(SCHEMA_KEY, schemaStr); - job.getConfiguration().set(DELIM, outputdelim); - job.setInputFormatClass(HCatInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - job.setJarByClass(TypeDataCheck.class); - job.setMapperClass(TypeDataCheckMapper.class); - job.setNumReduceTasks(0); - job.setOutputKeyClass(Long.class); - job.setOutputValueClass(Text.class); - FileOutputFormat.setOutputPath(job, new Path(outputDir)); - System.exit(job.waitForCompletion(true) ? 
0 : 1); - return 0; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - @Override - public Configuration getConf() { - return conf; - } - - @Override - public void setConf(Configuration conf) { - TypeDataCheck.conf = conf; - } - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/Util.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/Util.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/Util.java (working copy) @@ -1,105 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - -public class Util { - - static Map> typeMap = new HashMap>(); - - static{ - typeMap.put("tinyint", Byte.class); - typeMap.put("smallint", Short.class); - typeMap.put("int", Integer.class); - typeMap.put("bigint", Long.class); - typeMap.put("float", Float.class); - typeMap.put("double", Double.class); - typeMap.put("string", String.class); - typeMap.put("boolean", Boolean.class); - typeMap.put("struct", List.class); - typeMap.put("map", Map.class); - typeMap.put("array>", List.class); - } - - public static void die(String expectedType, Object o) throws IOException { - throw new IOException("Expected " + expectedType + ", got " + - o.getClass().getName()); - } - - - public static String check(String type, Object o) throws IOException { - if(o == null) { - return "null"; - } - if(check(typeMap.get(type), o)) { - if(type.equals("map")) { - Map m = (Map) o; - check(m); - } else if(type.equals("array>")) { - List> listOfMaps = (List>) o; - for(Map m: listOfMaps) { - check(m); - } - } else if(type.equals("struct")) { - List l = (List) o; - if(!check(Integer.class, l.get(0)) || - !check(String.class, l.get(1)) || - !check(Double.class, l.get(2))) { - die("struct", l); - } - } - } else { - die(typeMap.get(type).getName(), o); - } - return o.toString(); - } - - /** - * @param m - * @throws IOException - */ - public static void check(Map m) throws IOException { - if(m == null) { - return; - } - for(Entry e: m.entrySet()) { - // just access key and value to ensure they are correct - if(!check(String.class, e.getKey())) { - die("String", e.getKey()); - } - if(!check(String.class, e.getValue())) { - die("String", e.getValue()); - } - } - - } - - public static boolean check(Class expected, Object actual) { - if(actual == null) { - return true; - } - return expected.isAssignableFrom(actual.getClass()); - } - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteRC.java =================================================================== 
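Before the new WriteRC.java hunk: the deleted TypeDataCheck/Util pair above boils down to one idea, namely mapping each Hive type name to the Java class HCatalog hands the mapper and verifying every column with Class.isAssignableFrom, with null accepted for any type. A minimal standalone sketch of that check (the TypeCheckSketch class name, the abbreviated type map, and the sample values are illustrative and not part of the patch):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class TypeCheckSketch {
    // Hive type name -> Java class HCatalog is expected to hand to the mapper.
    private static final Map<String, Class<?>> TYPES = new HashMap<String, Class<?>>();
    static {
        TYPES.put("int", Integer.class);
        TYPES.put("bigint", Long.class);
        TYPES.put("double", Double.class);
        TYPES.put("string", String.class);
    }

    static void check(String hiveType, Object column) throws IOException {
        if (column == null) {
            return; // nulls are accepted for every type, as in Util.check
        }
        Class<?> expected = TYPES.get(hiveType);
        if (expected == null || !expected.isAssignableFrom(column.getClass())) {
            throw new IOException("Expected " + hiveType + ", got "
                    + column.getClass().getName());
        }
    }

    public static void main(String[] args) {
        try {
            check("int", 42);        // Integer matches "int"
            check("string", "abc");  // String matches "string"
            check("double", 42);     // Integer is not a Double -> IOException
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }
}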
--- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteRC.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteRC.java (revision 0) @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.HCatOutputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.hcatalog.mapreduce.OutputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. 
This is to simulate a typical operation in a map reduce + * program to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat + * jar> The argument controls the output delimiter The hcat jar + * location should be specified as file:// + */ +public class WriteRC extends Configured implements Tool { + + public static class Map extends + Mapper { + + String name; + Integer age; + Double gpa; + + @Override + protected void map( + WritableComparable key, + HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException, InterruptedException { + name = value.get(0)==null?null:(String)value.get(0); + age = value.get(1)==null?null:(Integer)value.get(1); + gpa = value.get(2)==null?null:(Double)value.get(2); + + HCatRecord record = new DefaultHCatRecord(5); + record.set(0, name); + record.set(1, age); + record.set(2, gpa); + + context.write(null, record); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String inputTableName = args[1]; + String outputTableName = args[2]; + String dbName = null; + + String principalID = System + .getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if (principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "WriteRC"); + HCatInputFormat.setInput(job, InputJobInfo.create(dbName, + inputTableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setJarByClass(WriteRC.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(WritableComparable.class); + job.setOutputValueClass(DefaultHCatRecord.class); + job.setNumReduceTasks(0); + HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, + outputTableName, null, serverUri, principalID)); + HCatSchema s = HCatInputFormat.getTableSchema(job); + System.err.println("INFO: output schema explicitly set for writing:" + + s); + HCatOutputFormat.setSchema(job, s); + job.setOutputFormatClass(HCatOutputFormat.class); + return (job.waitForCompletion(true) ? 0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new WriteRC(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTypeCheck.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTypeCheck.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTypeCheck.java (working copy) @@ -1,152 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Map.Entry; - -import org.apache.pig.EvalFunc; -import org.apache.pig.data.DataBag; -import org.apache.pig.data.DataType; -import org.apache.pig.data.Tuple; -import org.apache.pig.impl.logicalLayer.schema.Schema; -import org.apache.pig.impl.util.Utils; - -/** - * This UDF can be used to check that a tuple presented by HCatLoader has the - * right types for the fields - * - * Usage is : - * - * register testudf.jar; - * a = load 'numbers' using HCatLoader(...); - * b = foreach a generate HCatTypeCheck('intnum1000:int,id:int,intnum5:int,intnum100:int,intnum:int,longnum:long,floatnum:float,doublenum:double', *); - * store b into 'output'; - * - * The schema string (the first argument to the UDF) is of the form one would provide in a - * pig load statement. - * - * The output should only contain the value '1' in all rows. (This UDF returns - * the integer value 1 if all fields have the right type, else throws IOException) - * - */ -public class HCatTypeCheck extends EvalFunc { - - static HashMap> typeMap = new HashMap>(); - - @Override - public Integer exec(Tuple input) throws IOException { - String schemaStr = (String) input.get(0); - Schema s = null; - try { - s = getSchemaFromString(schemaStr); - } catch (Exception e) { - throw new IOException(e); - } - for(int i = 0; i < s.size(); i++) { - check(s.getField(i).type, input.get(i+1)); // input.get(i+1) since input.get(0) is the schema; - } - return 1; - } - - static { - typeMap.put(DataType.INTEGER, Integer.class); - typeMap.put(DataType.LONG, Long.class); - typeMap.put(DataType.FLOAT, Float.class); - typeMap.put(DataType.DOUBLE, Double.class); - typeMap.put(DataType.CHARARRAY, String.class); - typeMap.put(DataType.TUPLE, Tuple.class); - typeMap.put(DataType.MAP, Map.class); - typeMap.put(DataType.BAG, DataBag.class); - } - - - - private void die(String expectedType, Object o) throws IOException { - throw new IOException("Expected " + expectedType + ", got " + - o.getClass().getName()); - } - - - private String check(Byte type, Object o) throws IOException { - if(o == null) { - return ""; - } - if(check(typeMap.get(type), o)) { - if(type.equals(DataType.MAP)) { - Map m = (Map) o; - check(m); - } else if(type.equals(DataType.BAG)) { - DataBag bg = (DataBag) o; - for (Tuple tuple : bg) { - Map m = (Map) tuple.get(0); - check(m); - } - } else if(type.equals(DataType.TUPLE)) { - Tuple t = (Tuple) o; - if(!check(Integer.class, t.get(0)) || - !check(String.class, t.get(1)) || - !check(Double.class, t.get(2))) { - die("t:tuple(num:int,str:string,dbl:double)", t); - } - } - } else { - die(typeMap.get(type).getName(), o); - } - return o.toString(); - } - - /** - * @param m - * @throws IOException - */ - private void check(Map m) throws IOException { - for(Entry e: m.entrySet()) { - // just access key and value to ensure they are correct - if(!check(String.class, e.getKey())) { - die("String", e.getKey()); - } - if(!check(String.class, e.getValue())) { - die("String", e.getValue()); - } - } - - } - - private boolean check(Class expected, Object actual) { - if(actual == null) { - return true; - } - return expected.isAssignableFrom(actual.getClass()); - } - - Schema getSchemaFromString(String schemaString) throws Exception { - /** ByteArrayInputStream stream = new 
ByteArrayInputStream(schemaString.getBytes()) ; - QueryParser queryParser = new QueryParser(stream) ; - Schema schema = queryParser.TupleSchema() ; - Schema.setSchemaDefaultType(schema, org.apache.pig.data.DataType.BYTEARRAY); - return schema; - */ - return Utils.getSchemaFromString(schemaString); - } - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadText.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadText.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadText.java (revision 0) @@ -0,0 +1,125 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. 
This is to simulate a typical operation in a map reduce program + * to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat jar> + The argument controls the output delimiter + The hcat jar location should be specified as file:// + */ +public class ReadText extends Configured implements Tool { + + public static class Map + extends Mapper{ + + int t; + int si; + int i; + long b; + float f; + double d; + String s; + + @Override + protected void map(WritableComparable key, HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException ,InterruptedException { + t = (Integer)value.get(0); + si = (Integer)value.get(1); + i = (Integer)value.get(2); + b = (Long)value.get(3); + f = (Float)value.get(4); + d = (Double)value.get(5); + s = (String)value.get(6); + + HCatRecord record = new DefaultHCatRecord(7); + record.set(0, t); + record.set(1, si); + record.set(2, i); + record.set(3, b); + record.set(4, f); + record.set(5, d); + record.set(6, s); + + context.write(null, record); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String tableName = args[1]; + String outputDir = args[2]; + String dbName = null; + + String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if(principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "ReadText"); + HCatInputFormat.setInput(job, InputJobInfo.create( + dbName, tableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setJarByClass(ReadText.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(IntWritable.class); + job.setOutputValueClass(HCatRecord.class); + job.setNumReduceTasks(0); + FileOutputFormat.setOutputPath(job, new Path(outputDir)); + return (job.waitForCompletion(true) ? 0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new ReadText(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreComplex.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreComplex.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreComplex.java (working copy) @@ -1,135 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hcatalog.utils; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.Random; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.hcatalog.common.HCatConstants; -import org.apache.hcatalog.data.DefaultHCatRecord; -import org.apache.hcatalog.data.HCatRecord; -import org.apache.hcatalog.data.schema.HCatSchema; -import org.apache.hcatalog.mapreduce.HCatInputFormat; -import org.apache.hcatalog.mapreduce.HCatOutputFormat; -import org.apache.hcatalog.mapreduce.InputJobInfo; -import org.apache.hcatalog.mapreduce.OutputJobInfo; - -/** - * This is a map reduce test for testing hcat which goes against the "complex" - * table and writes to "complex_nopart_empty_initially" table. It reads data from complex which - * is an unpartitioned table and stores the data as-is into complex_empty_initially table - * (which is also unpartitioned) - * - * Usage: hadoop jar testudf.jar storecomplex <-libjars hive-hcat jar> - The hcat jar location should be specified as file:// - */ -public class StoreComplex { - - private static final String COMPLEX_TABLE_NAME = "complex"; - private static final String COMPLEX_NOPART_EMPTY_INITIALLY_TABLE_NAME = "complex_nopart_empty_initially"; - - - public static class ComplexMapper - extends Mapper{ - - @Override - protected void map(WritableComparable key, HCatRecord value, - org.apache.hadoop.mapreduce.Mapper.Context context) - throws IOException ,InterruptedException { - // just write out the value as-is - context.write(new IntWritable(0), value); - - } - } - - - public static void main(String[] args) throws Exception { - Configuration conf = new Configuration(); - args = new GenericOptionsParser(conf, args).getRemainingArgs(); - String[] otherArgs = new String[1]; - int j = 0; - for(int i = 0; i < args.length; i++) { - if(args[i].equals("-libjars")) { - // generic options parser doesn't seem to work! 
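 -                // ("tmpjars" is the Configuration key GenericOptionsParser itself sets for
 -                //  -libjars, so setting it by hand still ships the HCatalog jar with the job)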
- conf.set("tmpjars", args[i+1]); - i = i+1; // skip it , the for loop will skip its value - } else { - otherArgs[j++] = args[i]; - } - } - if (otherArgs.length != 1) { - usage(); - } - String serverUri = otherArgs[0]; - String tableName = COMPLEX_TABLE_NAME; - String dbName = "default"; - Map outputPartitionKvps = new HashMap(); - String outputTableName = null; - outputTableName = COMPLEX_NOPART_EMPTY_INITIALLY_TABLE_NAME; - // test with null or empty randomly - if(new Random().nextInt(2) == 0) { - System.err.println("INFO: output partition keys set to null for writing"); - outputPartitionKvps = null; - } - String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); - if(principalID != null) - conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); - Job job = new Job(conf, "storecomplex"); - // initialize HCatInputFormat - - HCatInputFormat.setInput(job, InputJobInfo.create( - dbName, tableName, null, serverUri, principalID)); - // initialize HCatOutputFormat - HCatOutputFormat.setOutput(job, OutputJobInfo.create( - dbName, outputTableName, outputPartitionKvps, serverUri, principalID)); - - - HCatSchema s = HCatInputFormat.getTableSchema(job); - HCatOutputFormat.setSchema(job, s); - job.setInputFormatClass(HCatInputFormat.class); - job.setOutputFormatClass(HCatOutputFormat.class); - job.setJarByClass(StoreComplex.class); - job.setMapperClass(ComplexMapper.class); - job.setOutputKeyClass(IntWritable.class); - job.setOutputValueClass(DefaultHCatRecord.class); - System.exit(job.waitForCompletion(true) ? 0 : 1); - } - - - /** - * - */ - private static void usage() { - System.err.println("Usage: hadoop jar testudf.jar storecomplex <-libjars hive-hcat jar>\n" + - "The hcat jar location should be specified as file://\n"); - System.exit(2); - - } - - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteJson.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteJson.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/WriteJson.java (revision 0) @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.HCatOutputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.hcatalog.mapreduce.OutputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. This is to simulate a typical operation in a map reduce + * program to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat + * jar> The argument controls the output delimiter The hcat jar + * location should be specified as file:// + */ +public class WriteJson extends Configured implements Tool { + + public static class Map extends + Mapper { + + String s; + Integer i; + Double d; + + @Override + protected void map( + WritableComparable key, + HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException, InterruptedException { + s = value.get(0)==null?null:(String)value.get(0); + i = value.get(1)==null?null:(Integer)value.get(1); + d = value.get(2)==null?null:(Double)value.get(2); + + HCatRecord record = new DefaultHCatRecord(5); + record.set(0, s); + record.set(1, i); + record.set(2, d); + + context.write(null, record); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String inputTableName = args[1]; + String outputTableName = args[2]; + String dbName = null; + + String principalID = System + .getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if (principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "WriteJson"); + HCatInputFormat.setInput(job, InputJobInfo.create(dbName, + inputTableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setJarByClass(WriteJson.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(WritableComparable.class); + job.setOutputValueClass(DefaultHCatRecord.class); + job.setNumReduceTasks(0); + HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, + outputTableName, null, serverUri, principalID)); + HCatSchema s = HCatInputFormat.getTableSchema(job); + System.err.println("INFO: output schema explicitly set for writing:" + + s); + HCatOutputFormat.setSchema(job, s); + job.setOutputFormatClass(HCatOutputFormat.class); + return (job.waitForCompletion(true) ? 
0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new WriteJson(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/SimpleRead.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/SimpleRead.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/SimpleRead.java (revision 0) @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DoubleWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. 
This is to simulate a typical operation in a map reduce program + * to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat jar> + The argument controls the output delimiter + The hcat jar location should be specified as file:// + */ +public class SimpleRead extends Configured implements Tool { + + private static final String TABLE_NAME = "studenttab10k"; + private static final String TAB = "\t"; + + public static class Map + extends Mapper{ + + String name; + int age; + double gpa; + + @Override + protected void map(WritableComparable key, HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException ,InterruptedException { + name = (String) value.get(0); + age = (Integer) value.get(1); + gpa = (Double) value.get(2); + context.write(new Text(name), new DoubleWritable(gpa)); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String tableName = args[1]; + String outputDir = args[2]; + String dbName = null; + + String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if(principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "SimpleRead"); + HCatInputFormat.setInput(job, InputJobInfo.create( + dbName, tableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + job.setJarByClass(SimpleRead.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(DoubleWritable.class); + FileOutputFormat.setOutputPath(job, new Path(outputDir)); + return (job.waitForCompletion(true) ? 0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new SimpleRead(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadWrite.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadWrite.java (revision 0) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/ReadWrite.java (revision 0) @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hcatalog.utils; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.util.GenericOptionsParser; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hcatalog.common.HCatConstants; +import org.apache.hcatalog.data.DefaultHCatRecord; +import org.apache.hcatalog.data.HCatRecord; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.hcatalog.mapreduce.HCatInputFormat; +import org.apache.hcatalog.mapreduce.HCatOutputFormat; +import org.apache.hcatalog.mapreduce.InputJobInfo; +import org.apache.hcatalog.mapreduce.OutputJobInfo; + +/** + * This is a map reduce test for testing hcat which goes against the "numbers" + * table. It performs a group by on the first column and a SUM operation on the + * other columns. This is to simulate a typical operation in a map reduce + * program to test that hcat hands the right data to the map reduce program + * + * Usage: hadoop jar sumnumbers <-libjars hive-hcat + * jar> The argument controls the output delimiter The hcat jar + * location should be specified as file:// + */ +public class ReadWrite extends Configured implements Tool { + + public static class Map extends + Mapper { + + String name; + int age; + double gpa; + + @Override + protected void map( + WritableComparable key, + HCatRecord value, + org.apache.hadoop.mapreduce.Mapper.Context context) + throws IOException, InterruptedException { + name = (String) value.get(0); + age = (Integer) value.get(1); + gpa = (Double) value.get(2); + context.write(new Text(name), value); + + } + } + + public int run(String[] args) throws Exception { + Configuration conf = getConf(); + args = new GenericOptionsParser(conf, args).getRemainingArgs(); + + String serverUri = args[0]; + String inputTableName = args[1]; + String outputTableName = args[2]; + String dbName = null; + + String principalID = System + .getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); + if (principalID != null) + conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); + Job job = new Job(conf, "ReadWrite"); + HCatInputFormat.setInput(job, InputJobInfo.create(dbName, + inputTableName, null, serverUri, principalID)); + // initialize HCatOutputFormat + + job.setInputFormatClass(HCatInputFormat.class); + job.setJarByClass(ReadWrite.class); + job.setMapperClass(Map.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(DefaultHCatRecord.class); + HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, + outputTableName, null, serverUri, principalID)); + HCatSchema s = HCatInputFormat.getTableSchema(job); + System.err.println("INFO: output schema explicitly set for writing:" + + s); + HCatOutputFormat.setSchema(job, s); + job.setOutputFormatClass(HCatOutputFormat.class); + return (job.waitForCompletion(true) ? 
0 : 1); + } + + public static void main(String[] args) throws Exception { + int exitCode = ToolRunner.run(new ReadWrite(), args); + System.exit(exitCode); + } +} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreNumbers.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreNumbers.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/StoreNumbers.java (working copy) @@ -1,232 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Random; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.util.GenericOptionsParser; -import org.apache.hcatalog.common.HCatConstants; -import org.apache.hcatalog.data.DefaultHCatRecord; -import org.apache.hcatalog.data.HCatRecord; -import org.apache.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hcatalog.data.schema.HCatSchema; -import org.apache.hcatalog.mapreduce.HCatInputFormat; -import org.apache.hcatalog.mapreduce.HCatOutputFormat; -import org.apache.hcatalog.mapreduce.InputJobInfo; -import org.apache.hcatalog.mapreduce.OutputJobInfo; - -/** - * This is a map reduce test for testing hcat which goes against the "numbers" - * table and writes data to another table. It reads data from numbers which - * is an unpartitioned table and adds 10 to each field. It stores the result into - * the datestamp='20100101' partition of the numbers_part_empty_initially table if the second - * command line arg is "part". If the second cmdline arg is "nopart" then the - * result is stored into the 'numbers_nopart_empty_initially' (unpartitioned) table. - * If the second cmdline arg is "nopart_pig", then the result is stored into the - * 'numbers_nopart_pig_empty_initially' (unpartitioned) table with the tinyint - * and smallint columns in "numbers" being stored as "int" (since pig cannot handle - * tinyint and smallint) - * - * Usage: hadoop jar storenumbers <-libjars hive-hcat jar> - If the second argument is "part" data is written to datestamp = '2010101' partition of the numbers_part_empty_initially table. - If the second argument is "nopart", data is written to the unpartitioned numbers_nopart_empty_initially table. - If the second argument is "nopart_pig", data is written to the unpartitioned numbers_nopart_pig_empty_initially table. 
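 - e.g. (illustrative invocation only; the jar path, host name and main-class form are
 -      assumptions, not part of the original usage text):
 -      hadoop jar testudf.jar org.apache.hcatalog.utils.StoreNumbers thrift://metastorehost:9933 part -libjars file:///path/to/hcatalog.jar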
- The hcat jar location should be specified as file:// - */ -public class StoreNumbers { - - private static final String NUMBERS_PARTITIONED_TABLE_NAME = "numbers_part_empty_initially"; - private static final String NUMBERS_TABLE_NAME = "numbers"; - private static final String NUMBERS_NON_PARTITIONED_TABLE_NAME = "numbers_nopart_empty_initially"; - private static final String NUMBERS_NON_PARTITIONED_PIG_TABLE_NAME = "numbers_nopart_pig_empty_initially"; - private static final String IS_PIG_NON_PART_TABLE = "is.pig.non.part.table"; - - public static class SumMapper - extends Mapper{ - - Integer intnum1000; - // though id is given as a Short by hcat, the map will emit it as an - // IntWritable so we can just sum in the reduce - Short id; - - // though intnum5 is handed as a Byte by hcat, the map() will emit it as - // an IntWritable so we can just sum in the reduce - Byte intnum5; - Integer intnum100; - Integer intnum; - Long longnum; - Float floatnum; - Double doublenum; - @Override - protected void map(WritableComparable key, HCatRecord value, - org.apache.hadoop.mapreduce.Mapper.Context context) - throws IOException ,InterruptedException { - boolean isnoPartPig = context.getConfiguration().getBoolean(IS_PIG_NON_PART_TABLE, false); - intnum1000 = ((Integer)value.get(0)); - id = ((Short) value.get(1)); - intnum5 = (((Byte)value.get(2))); - intnum100 = (((Integer) value.get(3))); - intnum = ((Integer) value.get(4)); - longnum = ((Long) value.get(5)); - floatnum = ((Float) value.get(6)); - doublenum = ((Double) value.get(7)); - HCatRecord output = new DefaultHCatRecord(8); - output.set(0, intnum1000 + 10); - if(isnoPartPig) - { - output.set(1, ((int)(id + 10))); - } else { - output.set(1, ((short)(id + 10))); - } - if(isnoPartPig) { - output.set(2, (int)(intnum5 + 10)); - } else { - output.set(2, (byte) (intnum5 + 10)); - } - - output.set(3, intnum100 + 10); - output.set(4, intnum + 10); - output.set(5, (long) (longnum + 10)); - output.set(6, (float) (floatnum + 10)); - output.set(7, (double) (doublenum + 10)); - for(int i = 0; i < 8; i++) { - System.err.println("XXX: class:" + output.get(i).getClass()); - } - context.write(new IntWritable(0), output); - - } - } - - - public static void main(String[] args) throws Exception { - Configuration conf = new Configuration(); - args = new GenericOptionsParser(conf, args).getRemainingArgs(); - String[] otherArgs = new String[2]; - int j = 0; - for(int i = 0; i < args.length; i++) { - if(args[i].equals("-libjars")) { - // generic options parser doesn't seem to work! 
- conf.set("tmpjars", args[i+1]); - i = i+1; // skip it , the for loop will skip its value - } else { - otherArgs[j++] = args[i]; - } - } - if (otherArgs.length != 2) { - usage(); - } - String serverUri = otherArgs[0]; - if(otherArgs[1] == null || ( - !otherArgs[1].equalsIgnoreCase("part") && !otherArgs[1].equalsIgnoreCase("nopart")) - && !otherArgs[1].equalsIgnoreCase("nopart_pig")) { - usage(); - } - boolean writeToPartitionedTable = (otherArgs[1].equalsIgnoreCase("part")); - boolean writeToNonPartPigTable = (otherArgs[1].equalsIgnoreCase("nopart_pig")); - String tableName = NUMBERS_TABLE_NAME; - String dbName = "default"; - Map outputPartitionKvps = new HashMap(); - String outputTableName = null; - conf.set(IS_PIG_NON_PART_TABLE, "false"); - if(writeToPartitionedTable) { - outputTableName = NUMBERS_PARTITIONED_TABLE_NAME; - outputPartitionKvps.put("datestamp", "20100101"); - } else { - if(writeToNonPartPigTable) { - conf.set(IS_PIG_NON_PART_TABLE, "true"); - outputTableName = NUMBERS_NON_PARTITIONED_PIG_TABLE_NAME; - } else { - outputTableName = NUMBERS_NON_PARTITIONED_TABLE_NAME; - } - // test with null or empty randomly - if(new Random().nextInt(2) == 0) { - outputPartitionKvps = null; - } - } - - String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL); - if(principalID != null) - conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID); - Job job = new Job(conf, "storenumbers"); - - // initialize HCatInputFormat - HCatInputFormat.setInput(job, InputJobInfo.create( - dbName, tableName, null, serverUri, principalID)); - // initialize HCatOutputFormat - HCatOutputFormat.setOutput(job, OutputJobInfo.create( - dbName, outputTableName, outputPartitionKvps, serverUri, principalID)); - // test with and without specifying schema randomly - HCatSchema s = HCatInputFormat.getTableSchema(job); - if(writeToNonPartPigTable) { - List newHfsList = new ArrayList(); - // change smallint and tinyint to int - for(HCatFieldSchema hfs: s.getFields()){ - if(hfs.getTypeString().equals("smallint")) { - newHfsList.add(new HCatFieldSchema(hfs.getName(), - HCatFieldSchema.Type.INT, hfs.getComment())); - } else if(hfs.getTypeString().equals("tinyint")) { - newHfsList.add(new HCatFieldSchema(hfs.getName(), - HCatFieldSchema.Type.INT, hfs.getComment())); - } else { - newHfsList.add(hfs); - } - } - s = new HCatSchema(newHfsList); - } - HCatOutputFormat.setSchema(job, s); - - - job.setInputFormatClass(HCatInputFormat.class); - job.setOutputFormatClass(HCatOutputFormat.class); - job.setJarByClass(StoreNumbers.class); - job.setMapperClass(SumMapper.class); - job.setOutputKeyClass(IntWritable.class); - job.setNumReduceTasks(0); - job.setOutputValueClass(DefaultHCatRecord.class); - System.exit(job.waitForCompletion(true) ? 
0 : 1); - } - - - /** - * - */ - private static void usage() { - System.err.println("Usage: hadoop jar storenumbers <-libjars hive-hcat jar>\n" + - "\tIf the second argument is \"part\" data is written to datestamp = '2010101' partition of " + - "the numbers_part_empty_initially table.\n\tIf the second argument is \"nopart\", data is written to " + - "the unpartitioned numbers_nopart_empty_initially table.\n\tIf the second argument is \"nopart_pig\", " + - "data is written to the unpartitioned numbers_nopart_pig_empty_initially table.\nt" + - "The hcat jar location should be specified as file://\n"); - System.exit(2); - - } - - -} Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTestDriver.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTestDriver.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTestDriver.java (working copy) @@ -1,61 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hcatalog.utils; - -import org.apache.hcatalog.utils.TypeDataCheck; -import org.apache.hadoop.util.ProgramDriver; - -/** - * A description of an example program based on its class and a - * human-readable description. 
- */ -public class HCatTestDriver { - - public static void main(String argv[]){ - int exitCode = -1; - ProgramDriver pgd = new ProgramDriver(); - try { - pgd.addClass("typedatacheck", TypeDataCheck.class, - "A map/reduce program that checks the type of each field and" + - " outputs the entire table (to test hcat)."); - pgd.addClass("sumnumbers", SumNumbers.class, - "A map/reduce program that performs a group by on the first column and a " + - "SUM operation on the other columns of the \"numbers\" table."); - pgd.addClass("storenumbers", StoreNumbers.class, "A map/reduce program that " + - "reads from the \"numbers\" table and adds 10 to each fields and writes " + - "to the \"numbers_partitioned\" table into the datestamp=20100101 " + - "partition OR the \"numbers_empty_initially\" table based on a " + - "cmdline arg"); - pgd.addClass("storecomplex", StoreComplex.class, "A map/reduce program that " + - "reads from the \"complex\" table and stores as-is into the " + - "\"complex_empty_initially\" table."); - pgd.addClass("storedemo", StoreDemo.class, "demo prog."); - pgd.driver(argv); - - // Success - exitCode = 0; - } - catch(Throwable e){ - e.printStackTrace(); - } - - System.exit(exitCode); - } -} - Index: src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTypeCheckHive.java =================================================================== --- src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTypeCheckHive.java (revision 1208047) +++ src/test/e2e/hcatalog/udfs/java/org/apache/hcatalog/utils/HCatTypeCheckHive.java (working copy) @@ -1,140 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.hcatalog.utils; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions; - -/** - * A hive udf to check types of the fields read from hcat. A sample hive query which can use this is: - * - * create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; - * select typecheck('map+struct+array>+int', - * mymap, mytuple, bagofmap, rownum) from complex; - * - * - * The first argument to the UDF is a string representing the schema of the columns in the table. - * The columns in the tables are the remaining args to it. - * The schema specification consists of the types as given by "describe
" - * with each column's type separated from the next column's type by a '+' - * - * The UDF will throw an exception (and cause the query to fail) if it does not - * encounter the correct types. - * - * The output is a string representation of the data , type and hive category. - * It is not advisable to use this against large dataset since the output would also - * be large. - * - */ -public final class HCatTypeCheckHive extends GenericUDF { - -ObjectInspector[] argOIs; -@Override -public Object evaluate(DeferredObject[] args) throws HiveException { - List row = new ArrayList(); - String typesStr = (String) getJavaObject(args[0].get(), argOIs[0], new ArrayList()); - String[] types = typesStr.split("\\+"); - for(int i = 0; i < types.length; i++) { - types[i] = types[i].toLowerCase(); - } - for(int i = 1; i < args.length; i++) { - ObjectInspector oi = argOIs[i]; - List categories = new ArrayList(); - Object o = getJavaObject(args[i].get(),oi, categories); - try { - if(o != null) { - Util.check(types[i-1], o); - } - } catch (IOException e) { - throw new HiveException(e); - } - row.add(o == null ? "null" : o); - row.add(":" + (o == null ? "null" : o.getClass()) + ":" + categories); - } - return row.toString(); -} - -private Object getJavaObject(Object o, ObjectInspector oi, List categories) { - if(categories != null) { - categories.add(oi.getCategory()); - } - if(oi.getCategory() == ObjectInspector.Category.LIST) { - List l = ((ListObjectInspector)oi).getList(o); - List result = new ArrayList(); - ObjectInspector elemOI = ((ListObjectInspector)oi).getListElementObjectInspector(); - for(Object lo : l) { - result.add(getJavaObject(lo, elemOI, categories)); - } - return result; - } else if (oi.getCategory() == ObjectInspector.Category.MAP) { - Map m = ((MapObjectInspector)oi).getMap(o); - Map result = new HashMap(); - ObjectInspector koi = ((MapObjectInspector)oi).getMapKeyObjectInspector(); - ObjectInspector voi = ((MapObjectInspector)oi).getMapValueObjectInspector(); - for(Entry e: m.entrySet()) { - result.put((String)getJavaObject(e.getKey(), koi, null), - (String)getJavaObject(e.getValue(), voi, null)); - } - return result; - - } else if (oi.getCategory() == ObjectInspector.Category.STRUCT) { - List s = ((StructObjectInspector)oi).getStructFieldsDataAsList(o); - List sf = ((StructObjectInspector)oi).getAllStructFieldRefs(); - List result = new ArrayList(); - for(int i = 0; i < s.size(); i++) { - result.add(getJavaObject(s.get(i), sf.get(i).getFieldObjectInspector(), categories)); - } - return result; - } else if(oi.getCategory() == ObjectInspector.Category.PRIMITIVE) { - return ((PrimitiveObjectInspector)oi).getPrimitiveJavaObject(o); - } - throw new RuntimeException("Unexpected error!"); -} - -@Override -public String getDisplayString(String[] arg0) { - return null; -} - -@Override -public ObjectInspector initialize(ObjectInspector[] argOIs) - throws UDFArgumentException { - this.argOIs = argOIs; - return ObjectInspectorFactory.getReflectionObjectInspector(String.class, - ObjectInspectorOptions.JAVA); -} - -} Index: src/test/e2e/hcatalog/tests/hive.conf =================================================================== --- src/test/e2e/hcatalog/tests/hive.conf (revision 0) +++ src/test/e2e/hcatalog/tests/hive.conf (revision 0) @@ -0,0 +1,117 @@ +#!/home/y/bin/perl + + # + # Do + # egrep '^#|name.*=>' hcat.conf | egrep -v '^#!|egrep' | less + # to get an outline of this test conf file + # + + # Has a couple of Hive set directives: + # set hive.exec.dynamic.partition.mode=nonstrict; + # set 
hive.exec.dynamic.partition=true; + + +$cfg = { + 'driver' => 'Hive', + 'groups' => [ + { + 'name' => 'Hive_Checkin', + 'tests' => [ { + 'num' => 1, + 'sql' => q\select * from studenttab10k;\, + 'floatpostprocess' => 1, + 'delimiter' => ' ', + }, + { + 'num' => 2, + 'sql' => q\drop table if exists checkin_2; + create table checkin_2 as select * from studenttab10k;\, + 'floatpostprocess' => 1, + 'delimiter' => ' ', + }, + { + 'num' => 3, + 'sql' => q\SELECT studenttab10k.* FROM studenttab10k JOIN votertab10k ON (studenttab10k.name = votertab10k.name);\, + 'floatpostprocess' => 1, + 'delimiter' => ' ', + }, + { + 'num' => 4, + 'sql' => q" + drop table if exists multi_insert_1_1; + drop table if exists multi_insert_1_2; + drop table if exists multi_insert_1_3; + + create table multi_insert_1_1 ( + name string, + ds string) + row format delimited + fields terminated by '\\t' + stored as textfile; + + create table multi_insert_1_2 ( + name string, + ds string) + row format delimited + fields terminated by '\\t' + stored as textfile; + + create table multi_insert_1_3 ( + name string, + ds string) + row format delimited + fields terminated by '\\t' + stored as textfile; + + from studentparttab30k + insert overwrite table multi_insert_1_1 + select name, ds + where ds = '20110924' + + insert overwrite table multi_insert_1_2 + select name, ds + where ds = '20110925' + + insert overwrite table multi_insert_1_3 + select name, ds + where ds = '20110926'; + ", + 'result_table' => ['multi_insert_1_1', + 'multi_insert_1_2', + 'multi_insert_1_3'], + 'verify_sql' =>["select name, ds + from studentparttab30k + where ds = '20110924';", + "select name, ds + from studentparttab30k + where ds = '20110925';", + "select name, ds + from studentparttab30k + where ds = '20110926';"] + } ] + }, # end g + { + 'name' => 'Hive_Read', + 'tests' => [ { + 'num' => 1, + 'sql' => q\select * from all100krc;\, + 'floatpostprocess' => 1, + 'delimiter' => ' ', + } ] + }, # end g + { + 'name' => 'Hive_Write', + 'tests' => [ { + 'num' => 1, + 'sql' => q\ +drop table if exists hive_write_1; +create table hive_write_1 (name string, age int, gpa double) stored as rcfile; +insert into TABLE hive_write_1 select * from all100krc;\, + 'result_table' => 'hive_write_1', + 'verify_sql' =>"select name, age, gpa from all100krc;", + 'floatpostprocess' => 1, + 'delimiter' => ' ', + } ] + } + ] +} Index: src/test/e2e/hcatalog/tests/hcat.conf =================================================================== --- src/test/e2e/hcatalog/tests/hcat.conf (revision 1208047) +++ src/test/e2e/hcatalog/tests/hcat.conf (working copy) @@ -13,2993 +13,126 @@ $cfg = { 'driver' => 'HCat', - -# 'run_as' => 'hadoopqa', - 'use-pig.pl' => 1, 'groups' => [ # This first group should be moved to deployer ? 
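+# Each entry in a group's 'tests' list is a hash: 'num' orders the test,
+# 'hcat' holds the statements run through the hcat client, and the optional
+# 'rc', 'expected_out_regex', 'expected_err_regex' and 'not_expected_out_regex'
+# keys appear to be checked by the harness against the command's exit status
+# and its stdout/stderr. A minimal sketch of an additional entry, kept
+# commented out so the driver ignores it (the table name hcat_sketch_1 is
+# hypothetical):
+#
+#     {
+#      'num' => 99
+#      ,'hcat' => q\
+# drop table if exists hcat_sketch_1;
+# create table hcat_sketch_1(name string, age int, gpa double) stored as textfile;
+# show tables 'hcat_sketch_1';\
+#      ,'rc' => 0
+#      ,'expected_out_regex' => 'hcat_sketch_1'
+#     },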
{ - 'name' => 'hcat_setup_Hive_createTable', + 'name' => 'HCat_CreateTable', 'tests' => [ { 'num' => 1 - ,'hcat_cmdline_args' => ['-g', 'users', '-p', 'rwxrwx---'], - ,'hcat' => " -CREATE external TABLE IF NOT EXISTS numbers ( -INTNUM1000 int, -ID smallint, -INTNUM5 tinyint, -INTNUM100 int, -INTNUM int, -LONGNUM bigint, -FLOATNUM float, -DOUBLENUM double -) -stored as RCFile -location ':INPATH:/numbers'; -" + ,'hcat' => q\ +drop table if exists hcat_createtable_1; +create table hcat_createtable_1(name string, +age int, +gpa double) +stored as textfile; +describe hcat_createtable_1;\ ,'rc' => 0 + ,'expected_out_regex' => 'name string(\s)*age int(\s)*gpa double' + ,'expected_err_regex' => 'OK(.*)OK(.*)OK' }, { - 'num' => 6 - ,'hcat_cmdline_args' => ['-g', 'users', '-p', 'rwxrwx---'], - ,'hcat' => " -CREATE external TABLE IF NOT EXISTS complex ( - mymap map, - mytuple struct, - bagofmap array>, - rownum int -) -row format DELIMITED FIELDS TERMINATED BY '\001' - COLLECTION ITEMS TERMINATED BY '\002' - MAP KEYS TERMINATED BY '\003' -stored as rcfile -location ':INPATH:/complex'; -" + 'num' => 2 + ,'hcat' => q\ +drop table if exists hcat_createtable_2; +create table hcat_createtable_2(name string, +age int, +gpa double) partitioned by (b string) stored as TEXTFILE; +describe extended hcat_createtable_2; +\, ,'rc' => 0 + ,'expected_out_regex' => 'name string(\s)*age int(\s)*gpa double' }, - { - 'num' => 7 - ,'hcat_cmdline_args' => ['-g', 'users', '-p', 'rwxrwx---'], - ,'hcat' => " -CREATE external TABLE IF NOT EXISTS boolean_table ( -myflag boolean, -rownum int -) -row format DELIMITED FIELDS TERMINATED BY '\001' - COLLECTION ITEMS TERMINATED BY '\002' - MAP KEYS TERMINATED BY '\003' -stored as rcfile -location ':INPATH:/boolean'; -" + 'num' => 3 + ,'hcat' => q\ +drop table if exists hcat_create_table_3; +create table if not exists hcat_createtable_3(name string, age int, gpa double) stored as textfile; +create table if not exists hcat_createtable_3(name string, age int, gpa double) stored as textfile; +describe hcat_createtable_3; +\, ,'rc' => 0 + ,'expected_out_regex' => 'name string(\s)*age int(\s)*gpa double' + ,'expected_err_regex' => 'OK(.*)OK(.*)OK(.*)OK' }, - { - 'num' => 8 - # same as 'numbers' from above, just with 'int' for small/tiny ints. 
- ,'hcat_cmdline_args' => ['-g', 'users', '-p', 'rwxrwx---'], - ,'hcat' => " -CREATE external TABLE IF NOT EXISTS numbers_pig ( -INTNUM1000 int, -ID int, -- smallint, -INTNUM5 int, -- tinyint, -INTNUM100 int, -INTNUM int, -LONGNUM bigint, -FLOATNUM float, -DOUBLENUM double -) -stored as RCFile -location ':INPATH:/numbers_pig'; -" - ,'rc' => 0 - }, - ], }, # end g -################################################################################ -# HIVE STORED DATA -################################################################################ - -#------------------------------------------------------------------------------- -# Create partitioned test table using Hive -#------------------------------------------------------------------------------- { - 'name' => 'hcat_hive2hive_partitioned', - 'tests' => [ + 'name' => 'HCat_DropTable', + 'tests' => [ - { - 'num' => 1, - 'hive' => " -CREATE TABLE tmp_hive_partitioned_:RUNID: ( - id smallint, - intnum int, - floatnum float -) -partitioned by ( - idmod5 tinyint -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_hive_partitioned_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, + { + 'num' => 1 + ,'hcat' => q\ +create table if not exists hcat_droptable_1(name string, +age int, +gpa double) +stored as textfile; +drop table hcat_droptable_1; +describe hcat_droptable_1;\ + ,'expected_out_regex' => 'does not exist' + }, + { + 'num' => 2 + ,'hcat' => q\ +create table if not exists hcat_droptable_2(name string, +age int, +gpa double) +stored as textfile; +drop table if exists hcat_droptable_2; +describe hcat_droptable_2;\, + ,'rc' => 0 + ,'expected_out_regex' => 'does not exist' + }, - { - 'num' => 2, - 'depends_on' => 'hcat_hive2hive_partitioned_1', - 'hadoop' => "fs -ls :TMP:/hcat_tmp_tables", - 'rc' => 0, - 'expected_out_regex' => ":TMP:/hcat_tmp_tables/tmp_hive_partitioned_:RUNID:", - }, - - { - 'num' => 3, - 'depends_on' => 'hcat_hive2hive_partitioned_1', - 'hive' => "show tables;", - 'rc' => 0, - 'expected_out_regex' => "tmp_hive_partitioned_:RUNID:", - }, - - { - 'num' => 4, - # select from empty table - 'depends_on' => 'hcat_hive2hive_partitioned_1', - 'hive' => "select * from tmp_hive_partitioned_:RUNID:;", - 'rc' => 0, - 'expected_err_regex' => "OK", - 'expected_out' => "", - }, - - { - 'num' => 5, - # - 'depends_on' => 'hcat_hive2hive_partitioned_1', - 'hive' => " -set hive.exec.dynamic.partition.mode=nonstrict; -set hive.exec.dynamic.partition=true; - -insert overwrite table tmp_hive_partitioned_:RUNID: -partition (idmod5=1) -select id, intnum, floatnum -from numbers -where id % 5 = 1; - -insert overwrite table tmp_hive_partitioned_:RUNID: -partition (idmod5=2) -select id, intnum, floatnum -from numbers -where id % 5 = 2; -", - 'rc' => 0, - 'expected_err_regex' => "OK", - 'expected_out' => "", - }, - - { - 'num' => 6, - # - 'depends_on' => 'hcat_hive2hive_partitioned_5', - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'floatpostprocess' => 1, - 'delimiter' => ' ', - 'hive' => " -select idmod5, id, intnum, floatnum -from tmp_hive_partitioned_:RUNID: -; -", - 'sql' => " -select id % 5, id, intnum, floatnum -from numbers -where (id % 5 = 1) - or (id % 5 = 2) -; -", - }, - - ] # end tests - }, # end group - -#------------------------------------------------------------------------------- -# HIVE STORED -> HIVE SECTION -#------------------------------------------------------------------------------- - - { - 
'name' => 'hcat_hive2hive_type_check', - 'tests' => [ - # for how the 'typecheck' udf works. - # If the the typecheck does not pass, the udf will abort. - # Look at the jobtracker for info about the actual value, - # e.g. something like: "Expected java.lang.String, got java.lang.Integer" - # The verification: - # 'expected_out_regex' => "class java", - # is needed because otherwise the tests might "PASS" because the output was empty... - - { - 'num' => 1, - 'hive' => q\ -add jar :FUNCPATH:/testudf.jar; -create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; -select typecheck('int+smallint+tinyint+int+int+bigint+float+double', -intnum1000,id,intnum5,intnum100,intnum,longnum,floatnum,doublenum) from numbers; -\, - 'rc' => 0, - 'expected_out_regex' => "class java", - 'expected_err_regex' => "OK", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 2, - 'hive' => q\ -add jar :FUNCPATH:/testudf.jar; -create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; -select typecheck('map+struct+array>+int', -mymap, mytuple, bagofmap, rownum) from complex; -\, - 'rc' => 0, - 'expected_out_regex' => "class java", - 'expected_err_regex' => "OK", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 3, - 'hive' => q\ -add jar :FUNCPATH:/testudf.jar; -create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; -select typecheck('boolean+int', -myflag,rownum) from boolean_table; -\, - 'rc' => 0, - 'expected_out_regex' => "class java", - 'expected_err_regex' => "OK", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 5, - 'floatpostprocess' => 1, - 'delimiter' => ' ', - 'hive' => q\ -select - id, -- expect smallint - intnum5 + id, -- expect smallint - intnum5 + id + intnum, -- expect int - intnum5 + id + intnum + longnum, -- expect bigint - intnum5 * id, -- expect smallint - intnum5 * id * intnum, -- expect int - intnum5 * id * intnum * longnum, -- expect bigint - intnum5 + 500, -- expect int - intnum5 + 1.5, -- expect float - cast(intnum5 + 1.5 as double), -- expect double - cast(intnum5 + 1.0 as int), -- expect int - floatnum + doublenum, -- expect double - floatnum * doublenum -- expect double -from numbers -order by id -limit 500 -; -\, - 'sql' => " -select - id, - intnum5 + id, - intnum5 + id + intnum, - intnum5 + id + intnum + longnum, - intnum5 * id, - intnum5 * id * intnum, - intnum5 * id * intnum * longnum, - intnum5 + 500, - intnum5 + 1.5, - cast(intnum5 + 1.5 as double precision), - cast(intnum5 + 1.0 as integer), - floatnum + doublenum, - floatnum * doublenum -from numbers -order by id -limit 500 -; -", - }, - - { - 'num' => 6, - 'ignore' => '!!! 
Hive truncates where Postgres rounds', - 'hive' => q\ -select - id, - intnum5, - cast(intnum5 + 1.9 as int) -from numbers -order by id -limit 5 -; -\, - 'sql' => " -select - id, - intnum5, - cast(intnum5 + 1.9 as integer) -from numbers -order by id -limit 5 -; -", - }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hive_diagnostic_operators', - 'tests' => [ - - { - 'num' => 1, - 'hive' => "show tables;", - 'expected_err_regex' => "OK", - }, - { - 'num' => 5, - 'hive' => "describe complex;", - 'sql' => "\\d", # dummy statement - }, - - { - 'num' => 6, - 'hive' => "describe complex.mytuple;", - 'expected_out_regex' => 'num\s+int\s+from\s+deserializer\s*\nstr\s+string\s+from\s+deserializer\s*\ndbl\s+double\s+from\s+deserializer', - }, - - { - 'num' => 7, - 'hive' => "describe complex.bagofmap;", - 'expected_out_regex' => 'bagofmap\s+array>\s+from deserializer', - }, - - { - 'num' => 8, - 'hive' => "describe boolean_table;", - 'sql' => "\\d", # dummy statement - }, - - { - 'num' => 9, - 'hive' => "describe boolean_table.myflag;", - 'expected_out_regex' => 'myflag\s+boolean\s+from deserializer', - }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hive_select', - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'tests' => [ - - { - 'num' => 1, - 'floatpostprocess' => 1, - 'delimiter' => ' ', - # actually no nulls in table, so had not needed the casting..., - 'hive' => q\ -select - if (intnum1000 IS NULL, "", cast(intnum1000 as STRING)), - if (id IS NULL, "", cast(id as STRING)), - if (intnum5 IS NULL, "", cast(intnum5 as STRING)), - if (intnum100 IS NULL, "", cast(intnum100 as STRING)), - if (intnum IS NULL, "", cast(intnum as STRING)), - if (longnum IS NULL, "", cast(longnum as STRING)), - if (floatnum IS NULL, "", cast(floatnum as STRING)), - if (doublenum IS NULL, "", cast(doublenum as STRING)) -from numbers; -\, - 'sql' => q\ -select - intnum1000, - id, - intnum5, - intnum100, - intnum, - longnum, - floatnum, - doublenum -from numbers; -\, - }, - - { - 'num' => 2, - 'hive' => q\ -select - mymap, - mytuple, - bagofmap, - rownum -from complex; -\, - 'sql' => "\\d", # dummy statement - }, - - { - 'num' => 3, - 'hive' => q\ -select - myflag, rownum -from boolean_table; -\, - 'sql' => "\\d", # dummy statement - }, - - - - { - 'num' => 8, - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'hive' => q\ -select - distinct - intnum1000, - intnum5, - intnum100, - intnum, - longnum -from numbers; -\, - 'sql' => q\ -select - distinct - intnum1000, - intnum5, - intnum100, - intnum, - longnum -from numbers; -\, - }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hive_lateral', - 'sortBenchmark' => 1, - 'sortResults' => 1, - # Had work-around for Hive problem - # set hive.optimize.cp=false; - 'tests' => [ - - # NOTE: - # The queries below written w/o WHERE clauses until this jira is resolved: - # https://issues.apache.org/jira/browse/HIVE-1056 - # http://wiki.apache.org/hadoop/Hive/LanguageManual/LateralView - - { - 'num' => 1, - 'hive' => q\ -select rownum, item -from complex LATERAL VIEW explode(bagofmap) explodedTable AS item; -\, - 'sql' => "\\d", # dummy statement - }, - - - { - 'num' => 3, - 'hive' => q\ -select - rownum, - item, - "item['a1']:", if (item['a1'] IS NULL, "", cast(item['a1'] as STRING)), - "item['a2']:", if (item['a2'] IS NULL, "", cast(item['a2'] as STRING)), - "item['k1']:", if (item['k1'] IS NULL, "", cast(item['k1'] as STRING)), - "item['k6']:", if (item['k6'] IS NULL, "", cast(item['k6'] as STRING)) -from complex LATERAL VIEW 
explode(bagofmap) explodedTable AS item -; -\, - 'sql' => "\\d", # dummy statement - }, - - - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hive_join', - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'tests' => [ - - { - 'num' => 1, - 'floatpostprocess' => 1, - 'delimiter' => ' ', - 'hive' => q\ -select - a.intnum1000, - b.intnum1000, - a.id, - b.id, - a.intnum5, - b.intnum5, - a.intnum100, - b.intnum100, - a.intnum, - b.intnum, - a.longnum, - b.longnum, - a.floatnum, - b.floatnum, - a.doublenum, - b.doublenum -from - numbers a - join numbers b - on (a.intnum5 = b.intnum5) -where - a.id < 30 - and b.id < 40 -order by a.intnum5 -; -\, - 'sql' => " -select - a.intnum1000, - b.intnum1000, - a.id, - b.id, - a.intnum5, - b.intnum5, - a.intnum100, - b.intnum100, - a.intnum, - b.intnum, - a.longnum, - b.longnum, - a.floatnum, - b.floatnum, - a.doublenum, - b.doublenum -from - numbers as a - join numbers as b - on (a.intnum5 = b.intnum5) -where - a.id < 30 - and b.id < 40 -order by a.intnum5 -; -", - }, - - { - 'num' => 2, - # join by map entry - # full outer join - 'hive' => q\ -select - a.rownum, - b.rownum, - a.mymap['mymapk1'], - b.mymap['mymapk1'], - a.mymap['mymapk3'], - b.mymap['mymapk3'], - a.mymap, - b.mymap -from - complex a - full outer join complex b - on (a.mymap['mymapk1'] - = b.mymap['mymapk3']) -; -\, - 'sql' => "\\d", # dummy statement - }, - - { - 'num' => 3, - # join by tuple item - # inner join - 'hive' => q\ -select - a.rownum, - b.rownum, - a.mytuple.dbl, - b.mytuple.dbl, - a.mytuple, - b.mytuple -from - complex a -join complex b -on (a.mytuple.dbl = b.mytuple.dbl + 1) -; -\, - 'sql' => "\\d", # dummy statement - }, - - - ] # end tests - }, # end group -# end group -#------------------------------------------------------------------------------- -# HIVE STORED -> PIG SECTION -#------------------------------------------------------------------------------- - + ], + }, # end g { - 'name' => 'hcat_hive2pig_load_describe', - # + 'name' => 'HCat_AlterTable', + 'tests' => [ - 'tests' => [ - - { - 'num' => 1, - 'pig' => " -a = load 'default.numbers_pig' using org.apache.hcatalog.pig.HCatLoader(); -describe a; -", - 'rc' => 0, - 'expected_out' => 'a: {intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double} -', - 'not_expected_err_regex' => "ERROR", - }, - - { - 'num' => 2, - 'pig' => " -a = load 'default.complex' using org.apache.hcatalog.pig.HCatLoader(); -describe a; -", - 'rc' => 0, - 'expected_out' => 'a: {mymap: map[],mytuple: (num: int,str: chararray,dbl: double),bagofmap: {(innerfield: map[])},rownum: int} -', - 'not_expected_err_regex' => "ERROR", - }, - - { - 'num' => 3, - 'ignore' => 'Pig does not understand boolean. 
Made a negative test for the error message.', - 'pig' => " -a = load 'default.boolean_table' using org.apache.hcatalog.pig.HCatLoader(); -describe a; -", - 'rc' => 0, - 'expected_out' => "", - 'not_expected_err_regex' => "ERROR", - }, - - ] # end tests - }, # end group + { + 'num' => 1 + ,'hcat' => q\ +drop table if exists hcat_altertable_1; +create table hcat_altertable_1(name string, age int, gpa double) partitioned by (b string) stored as textfile; +alter table hcat_altertable_1 add partition (b='2010-10-10'); +show partitions hcat_altertable_1;\ + ,'rc' => 0 + ,'expected_out_regex' => 'b=2010-10-10' + }, + { + 'num' => 2 + ,'hcat' => q\ +alter table hcat_altertable_1 add partition (b='2010-10-11'); +alter table hcat_altertable_1 drop partition (b='2010-10-10'); +show partitions hcat_altertable_1;\, + ,'rc' => 0 + ,'expected_out_regex' => 'b=2010-10-11' + ,'not_expected_out_regex' => 'b=2010-10-10' + }, + ], + }, # end g { - 'name' => 'hcat_hive2pig_type_check', - # - # * This UDF can be used to check that a tuple presented by org.apache.hcatalog.pig.HCatLoader has the - # * right types for the fields - # * (...) - # * The output should only contain the value '1' in all rows. (This UDF returns - # * the integer value 1 if all fields have the right type, else throws IOException) - - 'tests' => [ - - { - 'num' => 1, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.numbers_pig' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('intnum1000:int,id:int,intnum5:int,intnum100:int,intnum:int,longnum:long,floatnum:float,doublenum:double', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - { - 'num' => 2, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.complex' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('mymap: map[],mytuple: (num: int,str: chararray,dbl: double),bagofmap: {(innerfield: map[])},rownum: int', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - { - 'num' => 3, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.boolean_table' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('boolean+int', *); -store b into ':OUTPATH:'; -", - 'rc' => 6, - 'expected_err_regex' => "ERROR", - # 'expected_out_regex' => "1", - 'not_expected_out_regex' => "Success", - }, - - - ] # end tests - }, # end group - { - 'name' => 'hcat_hive2pig_empty', - 'tests' => [ - - { - 'num' => 1, - 'hive' => " -CREATE TABLE tmp_hive_empty_:RUNID: ( - INTNUM1000 int, - ID int, - INTNUM5 int, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_hive_empty_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_hive2pig_empty_1', - 'pig' => " -a = load 'default.tmp_hive_empty_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -store a into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - 'not_expected_err_regex' => "ERROR", - 'expected_out' => "", - }, - - { - 'num' => 3, - 'depends_on' => 
'hcat_hive2pig_empty_1', - 'pig' => " -a = load 'default.tmp_hive_empty_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -store a into 'default.tmp_hive_empty_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -", - 'rc' => 0, - 'expected_err_regex' => "Success", - 'expected_out' => "", - }, - - ] # end tests - }, # end group - { - - 'name' => 'hcat_hive2pig_arithmetic_operators', - # covering any items from the "Arithmetic Operators and More" section not covered elswhere - 'delimiter' => ' ', - 'tests' => [ - - { - 'num' => 5, - 'sortResults' => 1, - 'sortBenchmark' => 1, - # COUNT star - 'pig' => q? -a = load 'default.numbers_pig' USING org.apache.hcatalog.pig.HCatLoader(); -b = group a all; -c = foreach b generate COUNT(a.$0); -store c into ':OUTPATH:'; -?, - 'rc' => 0, - 'expected_err_regex' => "Success!", - 'expected_stdout' => "5000", - }, - { - 'num' => 6, - # Group - 'sortBenchmark' => 1, - 'sortResults' => 1, - # 'a: {intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double}' - 'pig' => q? -a = load 'default.numbers_pig' USING org.apache.hcatalog.pig.HCatLoader(); -b = group a by intnum5; -c = foreach b generate group as i5, COUNT(a) as count_rows, SUM(a.id) as sum_id; -store c into ':OUTPATH:'; -?, - 'sql' => ' -select intnum5, COUNT(id), SUM(id) -from numbers -group by intnum5 -;', - }, - { - 'num' => 7, - # Order by - # 'a: {intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double}' - ########################################## - # - # a = load '/user/hadoopqa/pig/tests/data/txt/numbers.txt' using PigStorage(':') - # as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); - # b = group a by intnum5; - # c = foreach b generate group as i5, COUNT(a) as count_rows, SUM(a.id) as sum_id; - # d = order c by i5; - # dump d; - # - ########################################## - # - 'floatpostprocess' => 1, - # WE SHOULD REALLY NOT BE SORTING HERE, BUT WE CAN'T GET STABLE SORT OUT - # OF PIG AND POSTGRES IN THE SAME MANNER - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'pig' => q? -a = load 'default.numbers_pig' USING org.apache.hcatalog.pig.HCatLoader(); -b = order a by intnum5; -store b into ':OUTPATH:'; -?, - # 'rc' => 0, - 'sql' => 'select * from numbers order by intnum5 ;', - }, - - - { - 'num' => 10, - # 9.3.5.1 FILTER such that an expression of the form: - Part I - # FILTER alias by exp1 - # FILTER alias by exp2 - # FILTER alias by exp3 - # gives the same result as - # - # FILTER alias by exp1, exp2, expr3 - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'floatpostprocess' => 1, - 'delimiter' => ' ', - 'pig' => q? 
-a = load 'default.numbers_pig' USING org.apache.hcatalog.pig.HCatLoader(); -b = filter a by intnum1000 > 3000; -c = filter b by id > 2500; -d = filter c by intnum5 > 3; -e = filter d by intnum > 5050; -f = filter e by longnum > 5050; -g = filter f by floatnum > 683; -h = filter g by doublenum > 683; -store h into ':OUTPATH:'; -?, - 'sql' => " -select * from numbers -where - intnum1000 > 3000 -and id > 2500 -and intnum5 > 3 -and intnum > 5050 -and longnum > 5050 -and floatnum > 683 -and doublenum > 683 -; -", - }, - - { - 'num' => 11, - # 9.3.5.1 FILTER such that an expression of the form: - Part II - # FILTER alias by exp1 - # FILTER alias by exp2 - # FILTER alias by exp3 - # gives the same result as - # - # FILTER alias by exp1, exp2, expr3 - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'floatpostprocess' => 1, - 'delimiter' => ' ', - 'pig' => q? -a = load 'default.numbers_pig' USING org.apache.hcatalog.pig.HCatLoader(); -b = filter a by - intnum1000 > 3000 - and id > 2500 - and intnum5 > 3 - and intnum > 5050 - and longnum > 5050 - and floatnum > 683 - and doublenum > 683; -; -store b into ':OUTPATH:'; -?, - 'sql' => " -select * from numbers -where - intnum1000 > 3000 -and id > 2500 -and intnum5 > 3 -and intnum > 5050 -and longnum > 5050 -and floatnum > 683 -and doublenum > 683 -; -", - }, - ] # end tests - }, # end group - -#------------------------------------------------------------------------------- -# HIVE STORED -> HADOOP SECTION -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_hive2hadoop_read', - # From: - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - # sum numbers - # Using doctored benchmark file due to rounding error, e.g.: - # actual line : 5000 4500500 2949 416084 5050809 5050809 1083307.100 1083308.561 - # expected line: 5000 4500500 2949 416084 5050809 5050809 1083310.000 1083308.561 - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.SumNumbers thrift://:THRIFTSERVER: :OUTPATH: -libjars file://:HCAT_JAR: -\, - 'sql' => " -select - intnum1000, - sum (id), - sum (intnum5), - sum (intnum100), - sum (intnum), - sum (longnum), - sum (floatnum), - sum (doublenum) -from numbers -group by intnum1000 -order by intnum1000; -", - }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hadoop_type_check', - # From: - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - # typedatacheck complex - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: complex 'map+struct+array>+int' :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "\\d;", # dummy - }, - - { - 'num' => 2, - # typedatacheck numbers - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: numbers int+smallint+tinyint+int+int+bigint+float+double :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select * from numbers;", - }, - - { - 'num' => 3, - # typedatacheck boolean_table - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: boolean_table boolean+int :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select * from boolean_table;", - }, - - ] # end tests - }, # end group - - { - 'name' => 
'hcat_hive2hadoop_write_numbers_nopart', - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - 'hive' => " -drop table numbers_nopart_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_1', - 'hive' => " -CREATE TABLE numbers_nopart_empty_initially ( - INTNUM1000 int, - ID smallint, - INTNUM5 tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as RCFile -location ':TMP:/hcat_tmp_tables/numbers_nopart_empty_initially' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 3, - # storenumbers - # 'nopart' -> data is written to the numbers_nopart_empty_initially table. - 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_2', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreNumbers thrift://:THRIFTSERVER: nopart -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=\\d+", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 4, - 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_3', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hive' => " -select - intnum1000, - id, - intnum5, - intnum100, - intnum, - longnum, - floatnum, - doublenum -from numbers_nopart_empty_initially; -", - 'sql' => " -select - 10 + intnum1000, - 10 + id, - 10 + intnum5, - 10 + intnum100, - 10 + intnum, - 10 + longnum, - 10 + floatnum, - 10 + doublenum -from numbers; -", - }, - - # pig cannot handle tinyint and smallint, hence create a version of the table with 'int' instead - { - 'num' => 5, - 'hive' => " -drop table numbers_nopart_pig_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 6, - 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_5', - 'hive' => " -CREATE TABLE numbers_nopart_pig_empty_initially ( - INTNUM1000 int, - ID int, - INTNUM5 int, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as RCFile -location ':TMP:/hcat_tmp_tables/numbers_nopart_pig_empty_initially' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 7, - # storenumbers - # 'nopart' -> data is written to the numbers_nopart_pig_empty_initially table. 
- 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_6', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreNumbers thrift://:THRIFTSERVER: nopart_pig -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=\\d+", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 8, - 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_7', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hive' => " -select - intnum1000, - id, - intnum5, - intnum100, - intnum, - longnum, - floatnum, - doublenum -from numbers_nopart_pig_empty_initially; -", - 'sql' => " -select - 10 + intnum1000, - 10 + id, - 10 + intnum5, - 10 + intnum100, - 10 + intnum, - 10 + longnum, - 10 + floatnum, - 10 + doublenum -from numbers; -", - }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hadoop_write_numbers_part', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - 'hive' => " -drop table numbers_part_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_hive2hadoop_write_numbers_part_1', - 'hive' => " -CREATE TABLE numbers_part_empty_initially ( - INTNUM1000 int, - ID smallint, - INTNUM5 tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -partitioned by ( - datestamp string -) -stored as RCFile -location ':TMP:/hcat_tmp_tables/numbers_part_empty_initially' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 3, - # storenumbers - # 'part' -> data is written to datestamp = '20100101' partition of the numbers_part_empty_initially table. 
- 'depends_on' => 'hcat_hive2hadoop_write_numbers_part_2', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreNumbers thrift://:THRIFTSERVER: part -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=\\d+", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 4, - 'depends_on' => 'hcat_hive2hadoop_write_numbers_part_3', - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hive' => " -select - intnum1000, - id, - intnum5, - intnum100, - intnum, - longnum, - floatnum, - doublenum, - datestamp -from numbers_part_empty_initially; -", - 'sql' => " -select - 10 + intnum1000, - 10 + id, - 10 + intnum5, - 10 + intnum100, - 10 + intnum, - 10 + longnum, - 10 + floatnum, - 10 + doublenum, - '20100101' -from numbers; -", - }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_hive2hadoop_write_complex_nopart', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - 'hive' => " -drop table complex_nopart_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_hive2hadoop_write_complex_nopart_1', - 'hive' => " -CREATE TABLE complex_nopart_empty_initially ( - mymap map, - mytuple struct, - bagofmap array>, - rownum int -) -stored as rcfile -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 3, - # storecomplex - 'depends_on' => 'hcat_hive2hadoop_write_complex_nopart_2', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreComplex thrift://:THRIFTSERVER: -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=\\d+", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 4, - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'depends_on' => 'hcat_hive2hadoop_write_complex_nopart_3', - 'hive' => " -select - mymap, - mytuple, - bagofmap, - rownum -from complex_nopart_empty_initially; -", - 'sql' => "\\d;", # dummy - }, - - ] # end tests - }, # end group - - - - -################################################################################ -# PIG STORED DATA -################################################################################ -# -# The bootstrap creates tables and loads data using Hive. -# Here tables with identical schemas are created and data stored to them using Pig. -# The tables are then verified and used by each of pig, hive, and hadoop. -# -# NOTE: The hcat_pig2pig_setup_tables group must run before the rest of the -# "pig stored data" groups. -# -################################################################################ - -#------------------------------------------------------------------------------- -# Prepare test tables using PIG + other 'store' tests -#------------------------------------------------------------------------------- - - # Currently loading from txt "using PigStorage(':') ..." 
- - { - 'name' => 'hcat_pig2pig_setup_tables', - # From: - 'tests' => [ - - { - 'num' => 1, - # numbers stored by pig - 'hive' => " -CREATE TABLE tmp_pig2pig_stored_numbers_:RUNID: ( - INTNUM1000 int, - ID int, -- smallint, - INTNUM5 int, -- tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_pig2pig_stored_numbers_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 2, # GOOD - 'depends_on' => 'hcat_pig2pig_setup_tables_1', - # currently not loading from Hive - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store A into 'default.tmp_pig2pig_stored_numbers_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 0, - }, - - { - 'num' => 3, - # complex stored by pig - # 'depends_on' => 'hcat_pig2pig_setup_tables_2', # not really, but #4 does - 'hive' => " -CREATE TABLE tmp_pig2pig_stored_complex_:RUNID: ( - mymap map, - mytuple struct, - bagofmap array>, - rownum int -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_pig2pig_stored_complex_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 4, - 'depends_on' => 'hcat_pig2pig_setup_tables_3', - # currently not loading from Hive - 'pig' => q\ -A = load 'default.complex' using org.apache.hcatalog.pig.HCatLoader(); -store A into 'default.tmp_pig2pig_stored_complex_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'mymap: map[],mytuple: (num: int,str: chararray,dbl: double),bagofmap: {(innerfield: map[])},rownum: int'); -\, - 'rc' => 0, - }, - - { - 'num' => 5, - # boolean_table stored by pig - 'hive' => " -CREATE TABLE tmp_pig2pig_stored_boolean_table_:RUNID: ( - myflag boolean, - rownum int -) -row format DELIMITED FIELDS TERMINATED BY '\001' - COLLECTION ITEMS TERMINATED BY '\002' - MAP KEYS TERMINATED BY '\003' -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_pig2pig_stored_boolean_table_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 6, - 'depends_on' => 'hcat_pig2pig_setup_tables_5', - 'ignore' => 'Pig does not understnad boolean', - 'pig' => q\ -A = load 'default.boolean_table' using org.apache.hcatalog.pig.HCatLoader(); -store A into 'default.tmp_pig2pig_stored_boolean_table_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'boolean:string,rownum:integer'); -\, - 'rc' => 0, - }, - - ] # end tests - - }, # end group - - { - 'name' => 'hcat_pig2pig_more_store', - 'tests' => [ - - { - 'num' => 1, - 'hive' => " -CREATE TABLE tmp_pig2pig_store_table_1_:RUNID: ( - INTNUM1000 int, - ID int, -- smallint, - INTNUM5 int, -- tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_pig2pig_store_table_1_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 
'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_pig2pig_more_store_1', - 'hive' => " -CREATE TABLE tmp_pig2pig_store_table_2_:RUNID: ( - INTNUM1000 int, - ID int, -- smallint, - INTNUM5 int, -- tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_pig2pig_store_table_2_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 3, - 'depends_on' => 'hcat_pig2pig_more_store_2', - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000,id,intnum5,intnum100,intnum,longnum,floatnum,doublenum); --- A = load 'default.numbers_pig' using org.apache.hcatalog.pig.HCatLoader(); -B = foreach A generate (int)intnum1000, (int)id, (int)intnum5, (int)intnum100, (int)intnum, (long)longnum, (float)floatnum, (double)doublenum; -C = filter B by id < 2000; -D = filter B by id >= 2000; --- store to 1st table -store C into 'default.tmp_pig2pig_store_table_1_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); --- store to 2nd table -store D into 'default.tmp_pig2pig_store_table_2_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 0, - 'expected_err_regex' => "Success", - }, - - { - 'num' => 4, - # to verify data in two tables written above - 'depends_on' => 'hcat_pig2pig_more_store_3', - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'hive' => " -select id, intnum from tmp_pig2pig_store_table_1_:RUNID: -", - 'sql' => " -select id, intnum -from numbers -where id < 2000 -; -", - }, - - { - 'num' => 5, - # to verify data in two tables written above - 'depends_on' => 'hcat_pig2pig_more_store_3', - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'hive' => " -select id, intnum from tmp_pig2pig_store_table_2_:RUNID: -", - 'sql' => " -select id, intnum -from numbers -where id >= 2000 -; -", - }, - -# ADD HCatStorer tests for: -# * not specifying schema -# - schema from HCatLoad ===> DONE -# - schema from load with PigStorage and 'as' ===> DONE -# - schema from 'generate' with types ===> DONE -# * not specifying anything -# - as above plus non-partitioned -# * adding columns -# ERROR tests: -# * missing columns - { - 'num' => 6, - # setup step - 'hive' => " -CREATE TABLE tmp_pig2pig_store_numbers_partitioned_:RUNID: ( - INTNUM1000 int, - ID int, --smallint, - INTNUM5 int, --tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -partitioned by ( - STR1 string, - STR2 string -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/pig2pig_store_numbers_partitioned_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 9, - 'depends_on' => 'hcat_pig2pig_more_store_6', - # Store into a new partition with org.apache.hcatalog.pig.HCatStorer - # Store without specifying schema, schema from PigStorage 'as' - 'pig' => q\ -a = load ':INPATH:/txt/numbers.txt' using PigStorage(':') - as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: 
long,floatnum: float,doublenum: double); -store a into 'default.tmp_pig2pig_store_numbers_partitioned_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=str1part1,STR2=str2part1'); -\, - 'rc' => 0, - }, - - { - 'num' => 10, - 'depends_on' => 'hcat_pig2pig_more_store_9', - # verify the above - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'pig' => q\ -a = load 'default.tmp_pig2pig_store_numbers_partitioned_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -b = filter a BY ( - str1 == 'str1part1' and - str2 == 'str2part1' - ); -store b into ':OUTPATH:'; -\, - 'sql' => q\ -select *, 'str1part1', 'str2part1' from numbers; -\, - }, - - { - 'num' => 11, - 'depends_on' => 'hcat_pig2pig_more_store_6', - # Store into a new partition with org.apache.hcatalog.pig.HCatStorer - # Store without specifying schema, schema from PigStorage 'as' - 'pig' => q\ -a = load ':INPATH:/txt/numbers.txt' using PigStorage(':') - as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store a into 'default.tmp_pig2pig_store_numbers_partitioned_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=str1part2,STR2=str2part2'); -\, - 'rc' => 0, - }, - - { - 'num' => 12, - 'depends_on' => 'hcat_pig2pig_more_store_11', - # verify the above - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'pig' => q\ -a = load 'default.tmp_pig2pig_store_numbers_partitioned_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -b = filter a BY ( - str1 == 'str1part1' and - str2 == 'str2part1' - ); -store b into ':OUTPATH:'; -\, - 'sql' => q\ -select *, 'str1part1', 'str2part1' from numbers; -\, - }, - - { - 'num' => 13, - 'depends_on' => 'hcat_pig2pig_more_store_6', - # Store into a new partition with org.apache.hcatalog.pig.HCatStorer - # Store without specifying schema, schema from 'generate' with types - 'pig' => q\ -a = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000, id, intnum5, intnum100, intnum, longnum, floatnum, doublenum); -b = foreach a generate (int)intnum1000, (int)id, (int)intnum5, (int)intnum100, (int)intnum, (long)longnum, (float)floatnum, (double)doublenum; -store b into 'default.tmp_pig2pig_store_numbers_partitioned_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=str1part3,STR2=str2part3'); -\, - 'rc' => 0, - }, - - { - 'num' => 14, - 'depends_on' => 'hcat_pig2pig_more_store_13', - # verify the above - 'sortBenchmark' => 1, - 'sortResults' => 1, - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'pig' => q\ -a = load 'default.tmp_pig2pig_store_numbers_partitioned_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -b = filter a BY ( - str1 == 'str1part2' and - str2 == 'str2part2' - ); -store b into ':OUTPATH:'; -\, - 'sql' => q\ -select *, 'str1part2', 'str2part2' from numbers; -\, - }, - - - ] # end tests - - }, # end group - -#------------------------------------------------------------------------------- -# PIG STORED -> HIVE SECTION -# Not a likely use case, commented out for time being -#------------------------------------------------------------------------------- - - #NaUC# { - #NaUC# 'name' => 'hcat_pig2hive_type_check', - #NaUC# 'tests' => [ - #NaUC# - #NaUC# { - #NaUC# 'num' => 1, - #NaUC# 'hive' => q\ - #NaUC# add jar :FUNCPATH:/testudf.jar; - #NaUC# create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; - #NaUC# select 
typecheck('int+smallint+tinyint+int+int+bigint+float+double', - #NaUC# intnum1000,id,intnum5,intnum100,intnum,longnum,floatnum,doublenum) from tmp_pig2pig_stored_numbers_:RUNID:; - #NaUC# \, - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "class java", - #NaUC# 'expected_err_regex' => "OK", - #NaUC# 'not_expected_err_regex' => "FAILED", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 2, - #NaUC# 'hive' => q\ - #NaUC# add jar :FUNCPATH:/testudf.jar; - #NaUC# create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; - #NaUC# select typecheck('map+struct+array>+int', - #NaUC# mymap, mytuple, bagofmap, rownum) from tmp_pig2pig_stored_complex_:RUNID:; - #NaUC# \, - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "class java", - #NaUC# 'expected_err_regex' => "OK", - #NaUC# 'not_expected_err_regex' => "FAILED", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 3, - #NaUC# 'hive' => q\ - #NaUC# add jar :FUNCPATH:/testudf.jar; - #NaUC# create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; - #NaUC# select typecheck('boolean+int', - #NaUC# myflag,rownum) from tmp_pig2pig_stored_boolean_table_:RUNID:; - #NaUC# \, - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "class java", - #NaUC# 'expected_err_regex' => "OK", - #NaUC# 'not_expected_err_regex' => "FAILED", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 4, - #NaUC# 'hive' => q\ - #NaUC# add jar :FUNCPATH:/testudf.jar; - #NaUC# create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; - #NaUC# select typecheck('string+string+string+string+int+string+string+string+string+string+string+string+string+string+map+map+array>+string+string+string+string', - #NaUC# bcookie, - #NaUC# src_spaceid, - #NaUC# srcpvid, - #NaUC# ts, - #NaUC# browser, - #NaUC# bckt, - #NaUC# type, - #NaUC# ip, - #NaUC# yuid, - #NaUC# referrer, - #NaUC# pg_spaceid, - #NaUC# dstid, - #NaUC# dstpvid, - #NaUC# dst_spaceid, - #NaUC# page_params, - #NaUC# clickinfo, - #NaUC# viewinfo, - #NaUC# datestamp, - #NaUC# srcid, - #NaUC# action, - #NaUC# testid - #NaUC# ) - #NaUC# where - #NaUC# datestamp = '20091102' - #NaUC# and srcid = '19174' - #NaUC# and browser = 3 - #NaUC# and src_spaceid = '2114728002' - #NaUC# limit 15 - #NaUC# ; - #NaUC# \, - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "class java", - #NaUC# 'expected_err_regex' => "OK", - #NaUC# 'not_expected_err_regex' => "FAILED", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 5, - #NaUC# 'floatpostprocess' => 1, - #NaUC# 'delimiter' => ' ', - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# id, -- expect smallint - #NaUC# intnum5 + id, -- expect smallint - #NaUC# intnum5 + id + intnum, -- expect int - #NaUC# intnum5 + id + intnum + longnum, -- expect bigint - #NaUC# intnum5 * id, -- expect smallint - #NaUC# intnum5 * id * intnum, -- expect int - #NaUC# intnum5 * id * intnum * longnum, -- expect bigint - #NaUC# intnum5 + 500, -- expect int - #NaUC# intnum5 + 1.5, -- expect float - #NaUC# cast(intnum5 + 1.5 as double), -- expect double - #NaUC# cast(intnum5 + 1.0 as int), -- expect int - #NaUC# floatnum + doublenum, -- expect double - #NaUC# floatnum * doublenum -- expect double - #NaUC# from tmp_pig2pig_stored_numbers_:RUNID: - #NaUC# order by id - #NaUC# limit 500 - #NaUC# ; - #NaUC# \, - #NaUC# 'sql' => " - #NaUC# select - #NaUC# id, - #NaUC# intnum5 + id, - #NaUC# intnum5 + id + intnum, - #NaUC# intnum5 + id + intnum + longnum, - #NaUC# intnum5 * id, - #NaUC# intnum5 * id * intnum, - #NaUC# intnum5 * id * intnum * 
longnum, - #NaUC# intnum5 + 500, - #NaUC# intnum5 + 1.5, - #NaUC# cast(intnum5 + 1.5 as double precision), - #NaUC# cast(intnum5 + 1.0 as integer), - #NaUC# floatnum + doublenum, - #NaUC# floatnum * doublenum - #NaUC# from numbers - #NaUC# order by id - #NaUC# limit 500 - #NaUC# ; - #NaUC# ", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 6, - #NaUC# 'ignore' => '!!! Hive truncates where Postgres rounds', - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# intnum5, - #NaUC# cast(intnum5 + 1.9 as int), - #NaUC# from tmp_pig2pig_stored_numbers_:RUNID: - #NaUC# order by id - #NaUC# limit 5 - #NaUC# ; - #NaUC# \, - #NaUC# 'sql' => " - #NaUC# select - #NaUC# intnum5, - #NaUC# cast(intnum5 + 1.9 as integer), - #NaUC# from numbers - #NaUC# order by id - #NaUC# limit 5 - #NaUC# ; - #NaUC# ", - #NaUC# }, - #NaUC# - #NaUC# ] # end tests - #NaUC# }, # end group - #NaUC# - #NaUC# { - #NaUC# 'name' => 'hcat_pig2hive_diagnostic_operators', - #NaUC# 'tests' => [ - #NaUC# - #NaUC# #NaUC# - #NaUC# { - #NaUC# 'num' => 5, - #NaUC# 'hive' => "describe tmp_pig2pig_stored_complex_:RUNID:;", - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 6, - #NaUC# 'ignore' => 'Open question about expected output', - #NaUC# 'hive' => "describe tmp_pig2pig_stored_complex_:RUNID:.mytuple;", - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "mytuple\tstruct\tfrom deserializer", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 7, - #NaUC# 'hive' => "describe tmp_pig2pig_stored_complex_:RUNID:.bagofmap;", - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "bagofmap\tarray>\tfrom deserializer", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 8, - #NaUC# 'hive' => "describe tmp_pig2pig_stored_boolean_table_:RUNID:;", - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 9, - #NaUC# 'hive' => "describe tmp_pig2pig_stored_boolean_table_:RUNID:.myflag;", - #NaUC# 'rc' => 0, - #NaUC# 'expected_out_regex' => "myflag\tboolean\tfrom deserializer", - #NaUC# }, - #NaUC# - #NaUC# ] # end tests - #NaUC# }, # end group - #NaUC# - #NaUC# { - #NaUC# 'name' => 'hcat_pig2hive_select', - #NaUC# 'sortBenchmark' => 1, - #NaUC# 'sortResults' => 1, - #NaUC# 'tests' => [ - #NaUC# - #NaUC# { - #NaUC# 'num' => 1, - #NaUC# 'floatpostprocess' => 1, - #NaUC# 'delimiter' => ' ', - #NaUC# # actually no nulls in table, so had not needed the casting..., - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# if (intnum1000 IS NULL, "", cast(intnum1000 as STRING)), - #NaUC# if (id IS NULL, "", cast(id as STRING)), - #NaUC# if (intnum5 IS NULL, "", cast(intnum5 as STRING)), - #NaUC# if (intnum100 IS NULL, "", cast(intnum100 as STRING)), - #NaUC# if (intnum IS NULL, "", cast(intnum as STRING)), - #NaUC# if (longnum IS NULL, "", cast(longnum as STRING)), - #NaUC# if (floatnum IS NULL, "", cast(floatnum as STRING)), - #NaUC# if (doublenum IS NULL, "", cast(doublenum as STRING)) - #NaUC# from tmp_pig2pig_stored_numbers_:RUNID:; - #NaUC# \, - #NaUC# 'sql' => q\ - #NaUC# select - #NaUC# intnum1000, - #NaUC# id, - #NaUC# intnum5, - #NaUC# intnum100, - #NaUC# intnum, - #NaUC# longnum, - #NaUC# floatnum, - #NaUC# doublenum - #NaUC# from numbers; - #NaUC# \, - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 2, - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# mymap, - #NaUC# mytuple, - #NaUC# bagofmap, - #NaUC# rownum - #NaUC# from tmp_pig2pig_stored_complex_:RUNID:; - #NaUC# \, - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 3, - #NaUC# 'hive' => q\ - 
#NaUC# select - #NaUC# myflag, rownum - #NaUC# from tmp_pig2pig_stored_boolean_table_:RUNID:; - #NaUC# \, - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# #NaUC# #NaUC# - #NaUC# #NaUC# - #NaUC# #NaUC# { - #NaUC# 'num' => 8, - #NaUC# 'sortBenchmark' => 1, - #NaUC# 'sortResults' => 1, - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# distinct - #NaUC# intnum1000, - #NaUC# intnum5, - #NaUC# intnum100, - #NaUC# intnum, - #NaUC# longnum - #NaUC# from tmp_pig2pig_stored_numbers_:RUNID:; - #NaUC# \, - #NaUC# 'sql' => q\ - #NaUC# select - #NaUC# distinct - #NaUC# intnum1000, - #NaUC# intnum5, - #NaUC# intnum100, - #NaUC# intnum, - #NaUC# longnum - #NaUC# from numbers; - #NaUC# \, - #NaUC# }, - #NaUC# - #NaUC# ] # end tests - #NaUC# }, # end group - #NaUC# - #NaUC# { - #NaUC# 'name' => 'hcat_pig2hive_lateral', - #NaUC# 'sortBenchmark' => 1, - #NaUC# 'sortResults' => 1, - #NaUC# 'tests' => [ - #NaUC# - #NaUC# # NOTE: - #NaUC# # The queries below written w/o WHERE clauses until this jira is resolved: - #NaUC# # https://issues.apache.org/jira/browse/HIVE-1056 - #NaUC# # http://wiki.apache.org/hadoop/Hive/LanguageManual/LateralView - #NaUC# - #NaUC# { - #NaUC# 'num' => 1, - #NaUC# 'hive' => q\ - #NaUC# select rownum, item - #NaUC# from tmp_pig2pig_stored_complex_:RUNID: - #NaUC# LATERAL VIEW explode(bagofmap) explodedTable AS item; - #NaUC# \, - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# #NaUC# { - #NaUC# 'num' => 3, - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# rownum, - #NaUC# item, - #NaUC# "item['a1']:", if (item['a1'] IS NULL, "", cast(item['a1'] as STRING)), - #NaUC# "item['a2']:", if (item['a2'] IS NULL, "", cast(item['a2'] as STRING)), - #NaUC# "item['k1']:", if (item['k1'] IS NULL, "", cast(item['k1'] as STRING)), - #NaUC# "item['k6']:", if (item['k6'] IS NULL, "", cast(item['k6'] as STRING)) - #NaUC# from complex LATERAL VIEW explode(bagofmap) explodedTable AS item - #NaUC# ; - #NaUC# \, - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# #NaUC# - #NaUC# ] # end tests - #NaUC# }, # end group - #NaUC# - #NaUC# { - #NaUC# 'name' => 'hcat_pig2hive_join', - #NaUC# 'sortBenchmark' => 1, - #NaUC# 'sortResults' => 1, - #NaUC# 'tests' => [ - #NaUC# - #NaUC# { - #NaUC# 'num' => 1, - #NaUC# 'floatpostprocess' => 1, - #NaUC# 'delimiter' => ' ', - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# a.intnum1000, - #NaUC# b.intnum1000, - #NaUC# a.id, - #NaUC# b.id, - #NaUC# a.intnum5, - #NaUC# b.intnum5, - #NaUC# a.intnum100, - #NaUC# b.intnum100, - #NaUC# a.intnum, - #NaUC# b.intnum, - #NaUC# a.longnum, - #NaUC# b.longnum, - #NaUC# a.floatnum, - #NaUC# b.floatnum, - #NaUC# a.doublenum, - #NaUC# b.doublenum - #NaUC# from - #NaUC# tmp_pig2pig_stored_numbers_:RUNID: a - #NaUC# join numbers b - #NaUC# on (a.intnum5 = b.intnum5) - #NaUC# where - #NaUC# a.id < 30 - #NaUC# and b.id < 40 - #NaUC# order by a.intnum5 - #NaUC# ; - #NaUC# \, - #NaUC# 'sql' => " - #NaUC# select - #NaUC# a.intnum1000, - #NaUC# b.intnum1000, - #NaUC# a.id, - #NaUC# b.id, - #NaUC# a.intnum5, - #NaUC# b.intnum5, - #NaUC# a.intnum100, - #NaUC# b.intnum100, - #NaUC# a.intnum, - #NaUC# b.intnum, - #NaUC# a.longnum, - #NaUC# b.longnum, - #NaUC# a.floatnum, - #NaUC# b.floatnum, - #NaUC# a.doublenum, - #NaUC# b.doublenum - #NaUC# from - #NaUC# numbers as a - #NaUC# join numbers as b - #NaUC# on (a.intnum5 = b.intnum5) - #NaUC# where - #NaUC# a.id < 30 - #NaUC# and b.id < 40 - #NaUC# order by a.intnum5 - #NaUC# ; - #NaUC# ", - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 2, - 
#NaUC# # join by map entry - #NaUC# # full outer join - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# a.rownum, - #NaUC# b.rownum, - #NaUC# a.mymap['mymapk1'], - #NaUC# b.mymap['mymapk1'], - #NaUC# a.mymap['mymapk3'], - #NaUC# b.mymap['mymapk3'], - #NaUC# a.mymap, - #NaUC# b.mymap - #NaUC# from - #NaUC# tmp_pig2pig_stored_complex_:RUNID: a - #NaUC# full outer join complex b - #NaUC# on (a.mymap['mymapk1'] - #NaUC# = b.mymap['mymapk3']) - #NaUC# ; - #NaUC# \, - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# { - #NaUC# 'num' => 3, - #NaUC# # join by tuple item - #NaUC# # inner join - #NaUC# 'hive' => q\ - #NaUC# select - #NaUC# a.rownum, - #NaUC# b.rownum, - #NaUC# a.mytuple.dbl, - #NaUC# b.mytuple.dbl, - #NaUC# a.mytuple, - #NaUC# b.mytuple - #NaUC# from - #NaUC# tmp_pig2pig_stored_complex_:RUNID: a - #NaUC# join complex b - #NaUC# on (a.mytuple.dbl = b.mytuple.dbl + 1) - #NaUC# ; - #NaUC# \, - #NaUC# 'sql' => "\\d", # dummy statement - #NaUC# }, - #NaUC# - #NaUC# ] # end tests - #NaUC# }, # end group - #NaUC# - #NaUC# -#------------------------------------------------------------------------------- -# PIG STORED -> PIG SECTION -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_pig2pig_type_check', - # Using udf from: - # - # * This UDF can be used to check that a tuple presented by org.apache.hcatalog.pig.HCatLoader has the - # * right types for the fields - # * (...) - # * The output should only contain the value '1' in all rows. (This UDF returns - # * the integer value 1 if all fields have the right type, else throws IOException) - - 'tests' => [ - - { - 'num' => 1, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.tmp_pig2pig_stored_numbers_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('intnum1000:int,id:int,intnum5:int,intnum100:int,intnum:int,longnum:long,floatnum:float,doublenum:double', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - { - 'num' => 2, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.tmp_pig2pig_stored_complex_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('mymap: map[],mytuple: (num: int,str: chararray,dbl: double),bagofmap: {(innerfield: map[])},rownum: int', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - { - 'num' => 3, - 'ignore' => 'pig does not understand boolean', - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.tmp_pig2pig_stored_boolean_table_:RUNID:' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('boolean+int', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - ] # end tests - }, # end group - -#------------------------------------------------------------------------------- -# PIG STORED -> HADOOP SECTION -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_pig2hadoop_read', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - # This tests is covered under 
hcat_hive2hadoop_read_1 - # { - # 'num' => 1, - # # sum numbers - # 'ignore' => '"sumnumbers" is hard coded to run on "numbers", which is created by Hive, not pig.', - # 'floatpostprocess' => 0, - # 'delimiter' => ' ', - # 'hadoop' => q\ - #jar :FUNCPATH:/testudf.jar sumnumbers thrift://:THRIFTSERVER: :OUTPATH: -libjars file://:HCAT_JAR: - #\, - # 'sql' => " - #select - # intnum1000, - # sum (id), - # sum (intnum5), - # sum (intnum100), - # sum (intnum), - # sum (longnum), - # sum (floatnum), - # sum (doublenum) - #from numbers - #group by intnum1000 - #order by intnum1000; - #", - # }, - - ] # end tests - }, # end group - - { - 'name' => 'hcat_pig2hadoop_type_check', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - # typedatacheck complex -# aborts !!! - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: tmp_pig2pig_stored_complex_:RUNID: 'map+struct+array>+int' :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select * from complex;", - }, - - { - 'num' => 2, - # typedatacheck numbers - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: tmp_pig2pig_stored_numbers_:RUNID: int+int+int+int+int+bigint+float+double :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select * from numbers;", - }, - - { - 'num' => 3, - # typedatacheck boolean_table - 'ignore' => 'pig does not understand boolean', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: tmp_pig2pig_stored_boolean_table_:RUNID: boolean+int :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select * from boolean_table;", - }, - - ] # end tests - }, # end group - - -################################################################################ -# HADOOP STORED DATA -################################################################################ -# -# The bootstrap creates tables and loads data using Hive. -# Here tables with identical schemas are created and data stored to them using Hadoop. -# The tables are then verified and used by each of pig, hive, and hadoop. -# -# NOTE: The hcat_hive2hadoop_setup_tables group must run before the rest of the -# "hadoop stored data" groups. -# -################################################################################ - -#------------------------------------------------------------------------------- -# Prepare test tables using Hadoop -#------------------------------------------------------------------------------- - -# This replicates the drop, create and writing parts from the hcat_hive2hadoop_... tests. -# This should ideally be moved to the bootstrap conf, but would require a change in table name. 
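# Editor's note (sketch, not an additional test): the setup groups below chain
# their steps with 'depends_on', whose value appears to be formed as
# '<group name>_<test num>' (e.g. 'hcat_hive2hadoop_setup_tables_1'). A minimal
# illustration with hypothetical group and table names:
#
#   {
#     'name'  => 'example_setup_tables',
#     'tests' => [
#       { 'num' => 1, 'hive' => "drop table example_tab;", 'expected_out' => "" },
#       { 'num' => 2,
#         'depends_on' => 'example_setup_tables_1',  # '<group name>_<num>' of the drop step
#         'hive'       => "CREATE TABLE example_tab (id int) stored as RCFile;",
#         'rc'         => 0 },
#     ],
#   },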
- - { - 'name' => 'hcat_hive2hadoop_setup_tables', - 'sortResults' => 1, - 'sortBenchmark' => 1, - 'hadoop_classpath' => ':HCAT_JAR:', - 'tests' => [ - - { - 'num' => 1, - 'hive' => " -drop table numbers_nopart_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_hive2hadoop_setup_tables_1', - 'hive' => " -CREATE TABLE numbers_nopart_empty_initially ( - INTNUM1000 int, - ID smallint, - INTNUM5 tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -stored as RCFile -location ':TMP:/hcat_tmp_tables/numbers_nopart_empty_initially' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 3, - # storenumbers - # 'nopart' -> data is written to the numbers_nopart_empty_initially table. - 'depends_on' => 'hcat_hive2hadoop_setup_tables_2', - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreNumbers thrift://:THRIFTSERVER: nopart -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=\\d+", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 4, - 'hive' => " -drop table numbers_part_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 5, - 'depends_on' => 'hcat_hive2hadoop_setup_tables_4', - 'hive' => " -CREATE TABLE numbers_part_empty_initially ( - INTNUM1000 int, - ID smallint, - INTNUM5 tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -partitioned by ( - datestamp string -) -stored as RCFile -location ':TMP:/hcat_tmp_tables/numbers_part_empty_initially' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 6, - # storenumbers - # 'part' -> data is written to datestamp = '20100101' partition of the numbers_part_empty_initially table. 
- 'depends_on' => 'hcat_hive2hadoop_setup_tables_5', - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreNumbers thrift://:THRIFTSERVER: part -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=\\d+", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 7, - 'hive' => " -drop table complex_nopart_empty_initially; -", - # does not give error even if the table does not exist - 'expected_out' => "", - }, - - { - 'num' => 8, - 'depends_on' => 'hcat_hive2hadoop_setup_tables_7', - 'hive' => " -CREATE TABLE complex_nopart_empty_initially ( - mymap map, - mytuple struct, - bagofmap array>, - rownum int -) -stored as rcfile -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0 - }, - - { - 'num' => 9, - # storecomplex - 'depends_on' => 'hcat_hive2hadoop_setup_tables_8', - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.StoreComplex thrift://:THRIFTSERVER: -libjars file://:HCAT_JAR: -\, - 'rc' => 0, - 'expected_out' => "", - 'expected_err_regex' => "HDFS_BYTES_WRITTEN=", - 'not_expected_err_regex' => "FAILED", - }, - - # !!! Add tests reading the just stored data - - ] # end tests - }, # end group - -#------------------------------------------------------------------------------- -# HADOOP STORED -> HIVE SECTION -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_hadoop2hive_type_check', - 'tests' => [ - # for how the 'typecheck' udf works. - # If the the typecheck does not pass, the udf will abort. - # Look at the jobtracker for info about the actual value, - # e.g. something like: "Expected java.lang.String, got java.lang.Integer" - # The verification: - # 'expected_out_regex' => "class java", - # is needed because otherwise the tests might "PASS" because the output was empty... 
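# Editor's note (sketch only; the values are copied from the entries that
# follow, nothing new is asserted): each typecheck test guards against a
# silent pass on several fronts at once:
#
#   'rc'                     => 0,            # the Hive client must exit cleanly
#   'expected_out_regex'     => "class java", # typecheck output must actually appear, not be empty
#   'expected_err_regex'     => "OK",         # Hive must report the query as OK
#   'not_expected_err_regex' => "FAILED",     # and must not report a failure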
- - { - 'num' => 1, - 'hive' => q\ -add jar :FUNCPATH:/testudf.jar; -create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; -select typecheck('int+smallint+tinyint+int+int+bigint+float+double', -intnum1000,id,intnum5,intnum100,intnum,longnum,floatnum,doublenum) from numbers_nopart_empty_initially; -\, - 'rc' => 0, - 'expected_out_regex' => "class java", - 'expected_err_regex' => "OK", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 2, - 'depends_on' => 'hcat_hive2hadoop_setup_tables_6', - 'hive' => q\ -add jar :FUNCPATH:/testudf.jar; -create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; -select typecheck('int+smallint+tinyint+int+int+bigint+float+double+string', -intnum1000,id,intnum5,intnum100,intnum,longnum,floatnum,doublenum,datestamp) from numbers_part_empty_initially; -\, - 'rc' => 0, - 'expected_out_regex' => "class java", - 'expected_err_regex' => "OK", - 'not_expected_err_regex' => "FAILED", - }, - - { - 'num' => 3, - 'hive' => q\ -add jar :FUNCPATH:/testudf.jar; -create temporary function typecheck as 'org.apache.hcatalog.utils.HCatTypeCheckHive'; -select typecheck('map+struct+array>+int', -mymap, mytuple, bagofmap, rownum) from complex_nopart_empty_initially; -\, - 'rc' => 0, - 'expected_out_regex' => "class java", - 'expected_err_regex' => "OK", - 'not_expected_err_regex' => "FAILED", - }, - - ] # end tests - }, # end group - -#------------------------------------------------------------------------------- -# HADOOP STORED -> PIG SECTION -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_hadoop2pig_type_check', - # * This UDF can be used to check that a tuple presented by org.apache.hcatalog.pig.HCatLoader has the - # * right types for the fields - # * (...) - # * The output should only contain the value '1' in all rows. 
(This UDF returns - # * the integer value 1 if all fields have the right type, else throws IOException) - - 'tests' => [ - - { - 'num' => 1, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.numbers_nopart_pig_empty_initially' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('intnum1000:int,id:int,intnum5:int,intnum100:int,intnum:int,longnum:long,floatnum:float,doublenum:double', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - # default.numbers_part_empty_initially has smallint/tinyint which cannot be handled by pig - # { - # 'num' => 2, - # 'pig' => " - #register :FUNCPATH:/testudf.jar; - #a = load 'default.numbers_part_empty_initially' using org.apache.hcatalog.pig.HCatLoader(); - #b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('intnum1000:int,id:int,intnum5:int,intnum100:int,intnum:int,longnum:long,floatnum:float,doublenum:double+string', *); - #store b into ':OUTPATH:'; - #", - # 'rc' => 0, - # 'expected_err_regex' => "Success", - # # 'expected_out_regex' => "1", - # # 'not_expected_out_regex' => "[^1\\t]", - # }, - - { - 'num' => 3, - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.complex_nopart_empty_initially' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('m:map[],t:tuple(num:int,str:chararray,dbl:double),bg:bag{t:tuple(m:map[])},i:int', *); -store b into ':OUTPATH:'; -", - 'rc' => 0, - 'expected_err_regex' => "Success", - # 'expected_out_regex' => "1", - # 'not_expected_out_regex' => "[^1\\t]", - }, - - ] # end tests - }, # end group - -#------------------------------------------------------------------------------- -# HADOOP STORED -> HADOOP SECTION -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_hadoop2hadoop_type_check', - # * This UDF can be used to check that a tuple presented by org.apache.hcatalog.pig.HCatLoader has the - # * right types for the fields - # * (...) - # * The output should only contain the value '1' in all rows. 
(This UDF returns - # * the integer value 1 if all fields have the right type, else throws IOException) - - 'tests' => [ - - { - 'num' => 1, - # typedatacheck complex_nopart_empty_initially - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: complex_nopart_empty_initially 'map+struct+array>+int' :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select * from complex;", - }, - - { - 'num' => 2, - # typedatacheck numbers_nopart_empty_initially - 'depends_on' => 'hcat_hive2hadoop_write_numbers_nopart_3', - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: numbers_nopart_empty_initially int+smallint+tinyint+int+int+bigint+float+double :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select intnum1000+10, id+10, intnum5+10, intnum100 + 10, intnum+10, longnum+10, floatnum+10, doublenum+10 from numbers;", - }, - - { - 'num' => 3, - # typedatacheck numbers_part_empty_initially - 'floatpostprocess' => 0, - 'delimiter' => ' ', - 'hadoop' => q\ -jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.TypeDataCheck thrift://:THRIFTSERVER: numbers_part_empty_initially int+smallint+tinyint+int+int+bigint+float+double+string :OUTPATH: tab -libjars file://:HCAT_JAR: -\, - 'sql' => "select intnum1000+10, id+10, intnum5+10, intnum100 + 10, intnum+10, longnum+10, floatnum+10, doublenum+10 , 20100101 from numbers;", - }, - - ] # end tests - }, # end group - -################################################################################ -# NEGATIVE -################################################################################ - -#------------------------------------------------------------------------------- -# Negative: hive -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_negative_hive', - 'tests' => [ - - { - 'num' => 1, - # Describe invalid Table - 'hive' => " -describe non_existing_table; -", - 'rc' => 9, # what is expected ? !!! - 'expected_err_regex' => "FAILED", - 'not_expected_err_regex' => "OK", - }, - - { - 'num' => 2, - # CREATE a table name that already exists, step 1 - 'hive' => " -CREATE TABLE tmp_name_collision_:RUNID: ( - ID int, - STR string -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_name_collision_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 3, - # CREATE a table name that already exists, step 2 - 'depends_on' => 'hcat_negative_hive_2', - 'hive' => " -CREATE TABLE tmp_name_collision_:RUNID: ( - ID lint, - STR string -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_name_collision_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 11, # what is expected ? !!! - 'expected_err_regex' => "FAILED", - 'not_expected_err_regex' => "OK", - }, - - { - 'num' => 4, - # SELECT with invalid column - 'hive' => " -select id, no_such_column from numbers; -", - 'rc' => 10, # what is expected ? !!! 
- # 'expected_err_regex' => "FAILED", - 'expected_err_regex' => "FAILED: Error in semantic analysis: Line 2:11 Invalid table alias or column reference 'no_such_column'", - 'not_expected_err_regex' => "OK", - - }, - - ] # end tests - - }, # end group -#------------------------------------------------------------------------------- -# Negative: pig -#------------------------------------------------------------------------------- - - { - 'name' => 'hcat_negative_pig2pig', - 'tests' => [ - - { - 'num' => 1, - # setup step - 'hive' => " -CREATE TABLE tmp_two_partitions_:RUNID: ( - INTNUM1000 int, - ID smallint, - INTNUM5 tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double -) -partitioned by ( - STR1 string, - STR2 string -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_two_partitions_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 2, - # setup step - 'hive' => " -CREATE TABLE tmp_non_partitioned_:RUNID: ( - INTNUM1000 int, - ID smallint, - INTNUM5 tinyint, - INTNUM100 int, - INTNUM int, - LONGNUM bigint, - FLOATNUM float, - DOUBLENUM double, - STR1 string, - STR2 string -) -stored as rcfile -location ':TMP:/hcat_tmp_tables/tmp_non_partitioned_:RUNID:' -TBLPROPERTIES ( - 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', - 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' -); -", - 'rc' => 0, - }, - - { - 'num' => 3, - 'depends_on' => 'hcat_negative_pig2pig_store_1', - # attempt to give partitions in writing to non-partitioned table - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store A into 'default.tmp_non_partitioned_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=A,STR2=B', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 6, # what is expected ? !!! - 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "ERROR", - # ERROR 2116: Unexpected error. Could not validate the output specification for: default.tmp_non_partitioned_1280729182 - # and in secondary log: - # Caused by: java.io.IOException: Invalid partition values specified, table tmp_non_partitioned_1280729182 has 0 partition keys) - }, - - { - 'num' => 4, - 'depends_on' => 'hcat_negative_pig2pig_store_2', - # attempt to give no partitions in writing to partitioned table - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store A into 'default.tmp_non_partitioned_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 6, # what is expected ? !!! 
- 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "ERROR", - }, - - { - 'num' => 5, - # attempt storing to non-existing table - 'depends_on' => 'hcat_negative_pig2pig_store_1', - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store A into 'default.no_such_table' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=A,STR2=B', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 6, # what is expected ? !!! - 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "ERROR", - # In secondary log: - # Caused by: NoSuchObjectException(message:default.no_such_table table not found) - }, - - { - 'num' => 6, - 'depends_on' => 'hcat_negative_pig2pig_store_1', - # attempt loading from non-existing table - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store A into 'default.no_such_table' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=A,STR2=B', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 6, # what is expected ? !!! - 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "ERROR", - }, - - { - 'num' => 7, - 'depends_on' => 'hcat_negative_pig2pig_store_2', - # attempt to write to existing partition - 'pig' => q\ -A = load ':INPATH:/txt/numbers.txt' using PigStorage(':') as (intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double); -store A into 'default.tmp_two_partitions_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=A,STR2=B', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -store A into 'default.tmp_two_partitions_:RUNID:' - using org.apache.hcatalog.pig.HCatStorer - ('STR1=A,STR2=B', - 'intnum1000: int,id: int,intnum5: int,intnum100: int,intnum: int,longnum: long,floatnum: float,doublenum: double'); -\, - 'rc' => 6, # what is expected ? !!! 
- 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "ERROR", - # Secondary log has: - # Caused by: java.io.IOException: No partition key value provided for key str1 of table tmp_two_partitions_1280774139 - }, - - { - 'num' => 8, - # filter on non-existing column - 'pig' => " -a = load 'default.numbers_pig' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate non_such, id, intnum; -store b into ':OUTPATH:'; -", - 'rc' => 6, - 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "Invalid", - }, - - { - 'num' => 9, - # filter on non-existing column - 'pig' => " -a = load 'default.numbers_pig' using org.apache.hcatalog.pig.HCatLoader(); -b = filter a by ((non_such == intnum) or (non_such != intnum)); -store b into ':OUTPATH:'; -", - 'rc' => 6, - 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "Invalid", - }, - - { - 'num' => 10, - # Pig does not understand boolean - 'pig' => " -a = load 'default.boolean_table' using org.apache.hcatalog.pig.HCatLoader(); -describe a; -", - 'rc' => 6, - 'expected_out' => "", - 'not_expected_err_regex' => "OK", - 'expected_err_regex' => "ERROR 1115: HCatalog column type 'BOOLEAN' is not supported in Pig as a column type", - }, - - { - 'num' => 11, - # Pig does not understand boolean - 'pig' => " -register :FUNCPATH:/testudf.jar; -a = load 'default.boolean_table' using org.apache.hcatalog.pig.HCatLoader(); -b = foreach a generate org.apache.hcatalog.utils.HCatTypeCheck('boolean+int', *); -store b into ':OUTPATH:'; -", - 'rc' => 6, - 'expected_err_regex' => "ERROR 1115: HCatalog column type 'BOOLEAN' is not supported in Pig as a column type", - 'expected_out' => "", - }, - - ] # end tests - - }, # end group - - ] # end groups + 'name' => 'HCat_Misc', + 'tests' => [ + { + 'num' => 1 + ,'hcat' => q\ +show databases;\ + ,'rc' => 0 + ,'expected_out_regex' => 'default' + }, + { + 'num' => 2 + ,'hcat' => q\ +show tables;\, + ,'rc' => 0 + ,'expected_out_regex' => 'studenttab10k' + }, + ], + }, # end g + ] } -; - Index: src/test/e2e/hcatalog/tests/pig.conf =================================================================== --- src/test/e2e/hcatalog/tests/pig.conf (revision 0) +++ src/test/e2e/hcatalog/tests/pig.conf (revision 0) @@ -0,0 +1,173 @@ +#!/home/y/bin/perl + + # + # Do + # egrep '^#|name.*=>' hcat.conf | egrep -v '^#!|egrep' | less + # to get an outline of this test conf file + # + + # Has a couple of Hive set directives: + # set hive.exec.dynamic.partition.mode=nonstrict; + # set hive.exec.dynamic.partition=true; + + +$cfg = { + 'driver' => 'Pig', + 'groups' => [ +# This first group should be moved to deployer ? 
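# Editor's note (sketch, not an additional test): the entries in this file
# share a common shape, inferred from the keys they use. 'hcat_prep' stages the
# target table with DDL, 'pig' runs the script under test through
# HCatLoader/HCatStorer, and the harness then verifies either ':OUTPATH:' or
# the named 'result_table' against the 'sql' benchmark query. The table name
# and test number below are hypothetical:
#
#   {
#     'num'       => 99,
#     'hcat_prep' => q\drop table if exists example_out; create table example_out (name string, age int, gpa double) STORED AS TEXTFILE;\,
#     'pig'       => q\a = load 'studenttab10k' using org.apache.hcatalog.pig.HCatLoader(); store a into 'example_out' using org.apache.hcatalog.pig.HCatStorer();\,
#     'result_table' => 'example_out',        # compared against the 'sql' result
#     'sql'       => q\select * from studenttab10k;\,
#     'floatpostprocess' => 1,
#     'delimiter' => ' ',
#   },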
+ { + 'name' => 'Pig_Checkin', + 'tests' => [ + + { + 'num' => 1 + ,'hcat_prep'=>q\drop table if exists pig_checkin_1; +create table pig_checkin_1 (name string, age int, gpa double) STORED AS TEXTFILE;\ + ,'pig' => q\a = load 'studenttab10k' using org.apache.hcatalog.pig.HCatLoader(); +store a into 'pig_checkin_1' using org.apache.hcatalog.pig.HCatStorer();\, + ,'result_table' => 'pig_checkin_1' + ,'sql' => q\select * from studenttab10k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 2 + ,'pig' => q\a = load 'studenttab10k' using org.apache.hcatalog.pig.HCatLoader(); +b = load 'votertab10k' using org.apache.hcatalog.pig.HCatLoader(); +c = join a by name, b by name; +store c into ':OUTPATH:';\, + ,'sql' => [ 'select s.name, s.age, gpa, v.name, v.age, registration, contributions from studenttab10k s join votertab10k v on (s.name = v.name);'] + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 3 + ,'pig' => q\a = load 'studenttab10k' using org.apache.hcatalog.pig.HCatLoader(); +b = load ':INPATH:/votertab10k' as (name:chararray, age:int, registration:chararray, contributions:float); +c = join a by name, b by name; +store c into ':OUTPATH:';\ + ,'sql' => q\select s.name, s.age, gpa, v.name, v.age, registration, contributions from studenttab10k s join votertab10k v on (s.name = v.name);\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 4 + ,'hcat_prep'=>q\drop table if exists pig_checkin_4_1; +drop table if exists pig_checkin_4_2; +create table pig_checkin_4_1 (name string, age int, gpa double) STORED AS TEXTFILE; +create table pig_checkin_4_2 (name string, age int, gpa double) STORED AS TEXTFILE;\ + ,'pig' => q\a = load 'studenttab10k' using org.apache.hcatalog.pig.HCatLoader(); +split a into b if age <=40, c if age > 40; +store b into 'pig_checkin_4_1' using org.apache.hcatalog.pig.HCatStorer(); +store c into 'pig_checkin_4_2' using org.apache.hcatalog.pig.HCatStorer();\, + ,'result_table' => ['pig_checkin_4_1','pig_checkin_4_2'] + ,'sql' => [ 'select * from studenttab10k where age<=40;', 'select * from studenttab10k where age>40;'] + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 5 + ,'hcat_prep'=>q\drop table if exists pig_checkin_5; +create table pig_checkin_5 (name string, age int, gpa double) STORED AS TEXTFILE;\ + ,'pig' => q\a = load 'studenttab10k' using org.apache.hcatalog.pig.HCatLoader(); +split a into b if age <=40, c if age > 40; +store b into 'pig_checkin_5' using org.apache.hcatalog.pig.HCatStorer(); +store c into ':OUTPATH:';\, + ,'result_table' => ['pig_checkin_5','?'] + ,'sql' => [ 'select * from studenttab10k where age<=40;', 'select * from studenttab10k where age>40;'] + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + + ], + }, # end g + { + 'name' => 'Pig_Read', + 'tests' => [ + + { + 'num' => 1 + ,'pig' => q\a = load 'all100k' using org.apache.hcatalog.pig.HCatLoader(); +store a into ':OUTPATH:';\, + ,'sql' => q\select * from all100k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 2 + ,'pig' => q\a = load 'all100kjson' using org.apache.hcatalog.pig.HCatLoader(); +b = foreach a generate s, i, d; +store b into ':OUTPATH:';\, + ,'sql' => q\select s, i, d from all100kjson;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 3 + ,'pig' => q\a = load 'all100krc' using org.apache.hcatalog.pig.HCatLoader(); +store a into ':OUTPATH:';\, + ,'sql' => q\select * from all100krc;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + } + ], + }, # end g + { + 
'name' => 'Pig_Write', + 'tests' => [ + { + 'num' => 1 + ,'hcat_prep'=>q\drop table if exists pig_write_1; +create table pig_write_1(t tinyint,si smallint,i int,b bigint,bool boolean,f float,d double,s string) stored as rcfile;\ + ,'pig' => q\a = load ':INPATH:/all100k' using PigStorage(':') as (t:int,si:int,i:int,b:int,bo:boolean,f:float,d:double,s:chararray); +store a into 'pig_write_1' using org.apache.hcatalog.pig.HCatStorer();\, + ,'result_table' => 'pig_write_1' + ,'sql' => q\select * from all100k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 2 + ,'hcat_prep'=>q\drop table if exists pig_write_2; +create table pig_write_2( + s string, + i int, + d double, + m map<string,string>, + bb array<struct<a:int,b:string>>) + STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' + INPUTDRIVER 'org.apache.hcatalog.pig.drivers.LoadFuncBasedInputDriver' OUTPUTDRIVER 'org.apache.hcatalog.pig.drivers.StoreFuncBasedOutputDriver' + TBLPROPERTIES ('hcat.pig.loader'='org.apache.pig.builtin.JsonLoader', 'hcat.pig.storer'='org.apache.pig.builtin.JsonStorage', 'hcat.pig.loader.args'= +'s:chararray, i:int, d:double, m:map[chararray], bb:{t:(a:int, b:chararray)}', 'hcat.pig.args.delimiter'=' '); +\ + ,'pig' => q\a = load 'all100kjson' using org.apache.hcatalog.pig.HCatLoader(); +b = foreach a generate s, i, d; +store b into ':OUTPATH:';\, + ,'sql' => q\select IFNULL(s, ""), IFNULL(i, ""), IFNULL(d, "") from all100kjson;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 3 + ,'hcat_prep'=>q\drop table if exists pig_write_3; +create table pig_write_3( + name string, + age int, + gpa double) +stored as rcfile +TBLPROPERTIES ( + 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', + 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' +); +\ + ,'pig' => q\a = load 'all100krc' using org.apache.hcatalog.pig.HCatLoader(); +store a into ':OUTPATH:';\, + ,'sql' => q\select * from all100krc;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + } + ], + }, # end g + + ] +} Index: src/test/e2e/hcatalog/tests/hadoop.conf =================================================================== --- src/test/e2e/hcatalog/tests/hadoop.conf (revision 0) +++ src/test/e2e/hcatalog/tests/hadoop.conf (revision 0) @@ -0,0 +1,161 @@ +#!/home/y/bin/perl + + # + # Do + # egrep '^#|name.*=>' hcat.conf | egrep -v '^#!|egrep' | less + # to get an outline of this test conf file + # + + # Has a couple of Hive set directives: + # set hive.exec.dynamic.partition.mode=nonstrict; + # set hive.exec.dynamic.partition=true; + + +$cfg = { + 'driver' => 'Hadoop', + 'groups' => [ +# This first group should be moved to deployer ?
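# Editor's note: each 'hadoop' entry below hands the harness one "hadoop jar"
# command line. The shape, inferred from the entries that follow, is
#
#   jar :FUNCPATH:/testudf.jar <driver class> -libjars :HCAT_JAR: :THRIFTSERVER: <input table> <output table or :OUTPATH:>
#
# where :FUNCPATH:, :HCAT_JAR:, :THRIFTSERVER: and :OUTPATH: are placeholders
# the harness substitutes at run time, and 'sql' is the benchmark query the
# produced output (or 'result_table') is compared against. A skeletal,
# illustrative entry (test number hypothetical):
#
#   {
#     'num'    => 99,
#     'hadoop' => q\
# jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.SimpleRead -libjars :HCAT_JAR: :THRIFTSERVER: studenttab10k :OUTPATH:
# \,
#     'sql'    => q\select name, gpa from studenttab10k;\,
#     'floatpostprocess' => 1,
#     'delimiter' => ' ',
#   },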
{ + 'name' => 'Hadoop_Checkin', + 'tests' => [ + { + 'num' => 1 + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.SimpleRead -libjars :HCAT_JAR: :THRIFTSERVER: studenttab10k :OUTPATH: +\, + ,'sql' => q\select name, gpa from studenttab10k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 2 + ,'hcat_prep'=>q\drop table if exists hadoop_checkin_2; +create table hadoop_checkin_2 (name string, age int, gpa double) STORED AS TEXTFILE;\ + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.ReadWrite -libjars :HCAT_JAR: :THRIFTSERVER: studenttab10k hadoop_checkin_2 +\, + ,'result_table' => 'hadoop_checkin_2' + ,'sql' => q\select * from studenttab10k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 3 + ,'hcat_prep'=>q\drop table if exists hadoop_checkin_3; +create table hadoop_checkin_3 (name string, cnt int) STORED AS TEXTFILE;\ + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.GroupByAge -libjars :HCAT_JAR: :THRIFTSERVER: studenttab10k hadoop_checkin_3 +\, + ,'result_table' => 'hadoop_checkin_3' + ,'sql' => q\select age, count(*) from studenttab10k group by age;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + ], + }, # end g + { + 'name' => 'Hadoop_Read', + 'tests' => [ + { + 'num' => 1 + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.ReadText -libjars :HCAT_JAR: :THRIFTSERVER: all100k :OUTPATH: +\, + ,'sql' => q\select * from all100k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 2 + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.ReadJson -libjars :HCAT_JAR: :THRIFTSERVER: all100kjson :OUTPATH: +\, + ,'sql' => q\select s, i, d from all100kjson;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 3 + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.ReadRC -libjars :HCAT_JAR: :THRIFTSERVER: all100krc :OUTPATH: +\, + ,'sql' => q\select * from all100krc;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + ], + }, # end g + { + 'name' => 'Hadoop_Write', + 'tests' => [ + { + 'num' => 1 + ,'hcat_prep'=>q\ +drop table if exists hadoop_write_1; +create table hadoop_write_1( + t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + s string) + row format delimited + fields terminated by ':' + stored as textfile;\ + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.WriteText -libjars :HCAT_JAR: :THRIFTSERVER: all100k hadoop_write_1 +\, + ,'result_table' => 'hadoop_write_1' + ,'sql' => q\select * from all100k;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 2 + ,'hcat_prep' => q\ +drop table if exists hadoop_write_2; +create table hadoop_write_2( + s string, + i int, + d double, + m map<string,string>, + bb array<struct<a:int,b:string>>) + STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat' + INPUTDRIVER 'org.apache.hcatalog.pig.drivers.LoadFuncBasedInputDriver' OUTPUTDRIVER 'org.apache.hcatalog.pig.drivers.StoreFuncBasedOutputDriver' + TBLPROPERTIES ('hcat.pig.loader'='org.apache.pig.builtin.JsonLoader', 'hcat.pig.storer'='org.apache.pig.builtin.JsonStorage', 'hcat.pig.loader.args'= +'s:chararray, i:int, d:double, m:map[chararray], bb:{t:(a:int, b:chararray)}', 'hcat.pig.args.delimiter'=' ');\ + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.WriteJson -libjars :HCAT_JAR: :THRIFTSERVER: all100kjson hadoop_write_2 +\, + ,'result_table' => 'hadoop_write_2'
,'sql' => q\select s, i, d, '', '' from all100kjson;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + { + 'num' => 3 + ,'hcat_prep' => q\ +drop table if exists hadoop_write_3; +create table hadoop_write_3( + name string, + age int, + gpa double) +stored as rcfile +TBLPROPERTIES ( + 'hcat.isd'='org.apache.hcatalog.rcfile.RCFileInputDriver', + 'hcat.osd'='org.apache.hcatalog.rcfile.RCFileOutputDriver' +); +\, + ,'hadoop' => q\ +jar :FUNCPATH:/testudf.jar org.apache.hcatalog.utils.WriteRC -libjars :HCAT_JAR: :THRIFTSERVER: all100krc hadoop_write_3 +\, + ,'result_table' => 'hadoop_write_3' + ,'sql' => q\select * from all100krc;\ + ,'floatpostprocess' => 1 + ,'delimiter' => ' ' + }, + ], + }, # end g + ] +} Index: src/test/e2e/hcatalog/conf/existing_deployer.conf =================================================================== --- src/test/e2e/hcatalog/conf/existing_deployer.conf (revision 1208047) +++ src/test/e2e/hcatalog/conf/existing_deployer.conf (working copy) @@ -22,10 +22,11 @@ # $cfg = { - 'deployer' => 'ExistingClusterDeployer', + 'deployer' => 'HCatExistingClusterDeployer', # hadoop values 'hadoopdir' => $ENV{'PH_CLUSTER'}, + 'hcat_data_dir' => '/user/hcat/test/data', # db values # 'dbuser' => 'pigtester', Index: src/test/e2e/hcatalog/conf/default.conf =================================================================== --- src/test/e2e/hcatalog/conf/default.conf (revision 1208047) +++ src/test/e2e/hcatalog/conf/default.conf (working copy) @@ -27,8 +27,8 @@ $cfg = { #HDFS - 'inpathbase' => '/user/pig/tests/data' - , 'outpathbase' => '/user/pig/out' + 'inpathbase' => '/user/hcat/tests/data' + , 'outpathbase' => '/user/hcat/out' #LOCAL , 'localinpathbase' => "$ENV{PH_LOCAL}/in" @@ -39,34 +39,46 @@ #TEST , 'benchmarkPath' => "$ENV{PH_OUT}/benchmarks" , 'scriptPath' => "$ENV{PH_ROOT}/libexec" - , 'tmpPath' => '/tmp/pigtest' - , 'jythonjar' => "$ENV{PH_JYTHON_JAR}" + , 'tmpPath' => "/tmp/pigtest" + , 'jythonjar' => "$ENV{PH_JYTHON_JAR}" + #TESTDB + , 'dbuser' => 'hcattest' + , 'dbhost' => 'localhost' + , 'dbpasswd' => 'hcattest' + , 'dbdb' => 'hcattestdb' + + #COMMON + , 'metastore.principal' => "$ENV{METASTORE_PRINCIPAL}" + , 'metastore_thrift' => $ENV{'PH_METASTORE_THRIFT'} + , 'thriftserver' => "$ENV{HCAT_URL}" + + #HCAT + , 'hcat_data_dir' => '/user/hcat/tests/data' + , 'hivehome' => $ENV{'PH_HIVE_HOME'} + , 'hcathome' => $ENV{'HCAT_INSTALL_DIR'} + , 'hcatalog.jar' => "$ENV{HCAT_JAR},$ENV{HIVE_ROOT}/build/dist/lib/hive-serde-0.9.0-SNAPSHOT.jar,$ENV{HIVE_ROOT}/build/dist/lib/hive-exec-0.9.0-SNAPSHOT.jar,$ENV{PIG_HOME}/pig-withouthadoop.jar,$ENV{HIVE_ROOT}/build/dist/lib/hive-metastore-0.9.0-SNAPSHOT.jar,$ENV{HIVE_ROOT}/build/dist/lib/libfb303-0.7.0.jar,$ENV{HIVE_ROOT}/build/dist/lib/jdo2-api-2.3-ec.jar" + #PIG , 'testconfigpath' => "$ENV{PH_CLUSTER}" , 'hadoopbin' => "$ENV{PH_CLUSTER_BIN}" , 'funcjarPath' => "$ENV{PH_ROOT}/lib/java" , 'paramPath' => "$ENV{PH_ROOT}/paramfiles" - , 'pigpath' => "$ENV{HCAT_ROOT}" - , 'oldpigpath' => "$ENV{PH_OLDPIG}" - ,'additionaljars' => "$ENV{HCAT_EXTRA_JARS}" + , 'pigpath' => "$ENV{PIG_ROOT}" + , 'oldpigpath' => "$ENV{PH_OLDPIG}" + , 'additionaljars' => 
"$ENV{HCAT_ROOT}/build/hcatalog/hcatalog-0.3.0-dev.jar:$ENV{HCAT_ROOT}/hive/external/build/metastore/hive-metastore-0.9.0-SNAPSHOT.jar:$ENV{HCAT_ROOT}/hive/external/build/dist/lib/libthrift.jar:$ENV{HCAT_ROOT}/hive/external/build/dist/lib/hive-exec-0.9.0-SNAPSHOT.jar:$ENV{HCAT_ROOT}/hive/external/build/dist/lib/libfb303.jar:$ENV{HCAT_ROOT}/hive/external/build/dist/lib/jdo2-api-2.3-ec.jar:$ENV{'HCAT_INSTALL_DIR'}/etc/hcatalog" - #HADOOP - , 'hadoopHome' => "$ENV{HCAT_ROOT}/lib" - ,'hadoop_classpath' => "$ENV{HCAT_EXTRA_JARS}" - , 'userhomePath' => "$ENV{HOME}" - ,'local.bin' => '/usr/bin' - - ,'logDir' => "$ENV{PH_OUT}/log" - ,'propertiesFile' => "./conf/testpropertiesfile.conf" - ,'harness.console.level' => 'ERROR' + #HADOOP + , 'hadoopHome' => "$ENV{HCAT_ROOT}/lib" + , 'hadoop_classpath' => "$ENV{HCAT_EXTRA_JARS}" + , 'userhomePath' => "$ENV{HOME}" + , 'local.bin' => '/usr/bin' + , 'logDir' => "$ENV{PH_OUT}/log" + , 'propertiesFile' => "./conf/testpropertiesfile.conf" + , 'harness.console.level' => 'ERROR' - #HIVE + #HIVE , 'hive_bin_location' => "$ENV{HIVE_ROOT}/build/dist/bin" + , 'hivehome' => "$ENV{HIVE_HOME}" - , 'metastore.principal' => "$ENV{METASTORE_PRINCIPAL}" - #HCATALOG - ,'thriftserver' => "$ENV{HCAT_URL}" - ,'hcatalog.jar' => "$ENV{HCAT_JAR},file://$ENV{HIVE_ROOT}/lib/thrift-fb303-0.5.0.jar,file://$ENV{HIVE_ROOT}/lib/thrift-0.5.0.jar,file://$ENV{HIVE_ROOT}/build/metastore/hive-metastore-0.8.0-SNAPSHOT.jar,file://$ENV{HIVE_ROOT}/build/common/hive-common-0.8.0-SNAPSHOT.jar,file://$ENV{HIVE_ROOT}/build/shims/hive-shims-0.8.0-SNAPSHOT.jar,file://$ENV{HIVE_ROOT}/build/serde/hive-serde-0.8.0-SNAPSHOT.jar,file://$ENV{HIVE_ROOT}/build/ql/hive-exec-0.8.0-SNAPSHOT.jar" -#,'hcat_bin_location' => "$ENV{HCAT_ROOT}/bin/hcat.sh" }; Index: src/test/e2e/hcatalog/deployers/ExistingClusterDeployer.pm =================================================================== --- src/test/e2e/hcatalog/deployers/ExistingClusterDeployer.pm (revision 1208047) +++ src/test/e2e/hcatalog/deployers/ExistingClusterDeployer.pm (working copy) @@ -1,277 +0,0 @@ -############################################################################ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -package ExistingClusterDeployer; - -use IPC::Run qw(run); -use TestDeployer; - -use strict; -use English; - -########################################################################### -# Class: ExistingClusterDeployer -# Deploy the Pig harness to a cluster and database that already exists. - -############################################################################## -# Sub: new -# Constructor -# -# Paramaters: -# None -# -# Returns: -# None. 
-sub new -{ - my $proto = shift; - my $class = ref($proto) || $proto; - my $self = {}; - - bless($self, $class); - - return $self; -} - -############################################################################## -# Sub: checkPrerequisites -# Check any prerequisites before a deployment is begun. For example if a -# particular deployment required the use of a database system it could -# check here that the db was installed and accessible. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub checkPrerequisites -{ - my ($self, $cfg, $log) = @_; - - # They must have declared the conf directory for their Hadoop installation - if (! defined $cfg->{'testconfigpath'} || $cfg->{'testconfigpath'} eq "") { - print $log "You must set the key 'hadoopconfdir' to your Hadoop conf directory " - . "in existing.conf\n"; - die "hadoopconfdir is not set in existing.conf\n"; - } - - # They must have declared the executable path for their Hadoop installation - if (! defined $cfg->{'hadoopbin'} || $cfg->{'hadoopbin'} eq "") { - print $log "You must set the key 'hadoopbin' to your Hadoop bin path" - . "in existing.conf\n"; - die "hadoopbin is not set in existing.conf\n"; - } - - # Run a quick and easy Hadoop command to make sure we can - $self->runHadoopCmd($cfg, $log, "fs -ls /"); - -} - -############################################################################## -# Sub: deploy -# Deploy any required packages -# This is a no-op in this case because we're assuming both the cluster and the -# database already exist -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub deploy -{ -} - -############################################################################## -# Sub: start -# Start any software modules that are needed. -# This is a no-op in this case because we're assuming both the cluster and the -# database already exist -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub start -{ -} - -############################################################################## -# Sub: generateData -# Generate any data needed for this test run. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub generateData -{ - my ($self, $cfg, $log) = @_; - my @tables = ( - { - 'name' => "numbers", - 'filetype' => "rcfile", - 'hdfs' => "numbers_pig", - }, { - 'name' => "boolean", - 'filetype' => "rcfile", - 'hdfs' => "boolean", - }, { - 'name' => "complex", - 'filetype' => "rcfile", - 'hdfs' => "complex", - }, { - 'name' => "numbers", - 'filetype' => "txt", - 'hdfs' => "txt", - }, { - 'name' => "numbers", - 'filetype' => "rcfile", - 'hdfs' => "numbers", - }, - ); - - # Create the HDFS directories - $self->runHadoopCmd($cfg, $log, "fs -mkdir $cfg->{'inpathbase'}"); - - foreach my $table (@tables) { - print "Generating data for $table->{'name'}\n"; - # Copy the data to HDFS - my $hadoop = "fs -copyFromLocal data/$table->{'name'}.$table->{'filetype'} ". - "$cfg->{'inpathbase'}/$table->{'hdfs'}/$table->{'name'}.$table->{'filetype'}"; - $self->runHadoopCmd($cfg, $log, $hadoop); - - } -} - -############################################################################## -# Sub: confirmDeployment -# Run checks to confirm that the deployment was successful. 
When this is -# done the testing environment should be ready to run. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# Nothing -# This method should die with an appropriate error message if there is -# an issue. -# -sub confirmDeployment -{ -} - -############################################################################## -# Sub: deleteData -# Remove any data created that will not be removed by undeploying. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub deleteData -{ -} - -############################################################################## -# Sub: stop -# Stop any servers or systems that are no longer needed once testing is -# completed. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub stop -{ -} - -############################################################################## -# Sub: undeploy -# Remove any packages that were installed as part of the deployment. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# None -# -sub undeploy -{ -} - -############################################################################## -# Sub: confirmUndeployment -# Run checks to confirm that the undeployment was successful. When this is -# done anything that must be turned off or removed should be turned off or -# removed. -# -# Paramaters: -# globalHash - hash from config file, including deployment config -# log - log file handle -# -# Returns: -# Nothing -# This method should die with an appropriate error message if there is -# an issue. -# -sub confirmUndeployment -{ - die "$0 INFO : confirmUndeployment is a virtual function!"; -} - -sub runHadoopCmd($$$$) -{ - my ($self, $cfg, $log, $c) = @_; - - # set the PIG_CLASSPATH environment variable - $ENV{'HADOOP_CLASSPATH'} = "$cfg->{'testconfigpath'}"; - - my @cmd = ("$cfg->{'hadoopbin'}"); - push(@cmd, split(' ', $c)); - - $self->runCmd($log, \@cmd); -} - -sub runCmd($$$) -{ - my ($self, $log, $cmd) = @_; - - print $log "Going to run " . join(" ", @$cmd) . "\n"; - - run($cmd, \undef, $log, $log) or - die "Failed running " . join(" ", @$cmd) . "\n"; -} - -1; Index: src/test/e2e/hcatalog/deployers/HCatExistingClusterDeployer.pm =================================================================== --- src/test/e2e/hcatalog/deployers/HCatExistingClusterDeployer.pm (revision 0) +++ src/test/e2e/hcatalog/deployers/HCatExistingClusterDeployer.pm (revision 0) @@ -0,0 +1,340 @@ +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +package HCatExistingClusterDeployer; + +use IPC::Run qw(run); +use TestDeployer; +use Util; + +use strict; +use English; + +our @ISA = "TestDeployer"; + +########################################################################### +# Class: HiveExistingClusterDeployer +# Deploy the Pig harness to a cluster and database that already exists. + +############################################################################## +# Sub: new +# Constructor +# +# Paramaters: +# None +# +# Returns: +# None. +sub new +{ + my $proto = shift; + my $class = ref($proto) || $proto; + my $self = {}; + + bless($self, $class); + + return $self; +} + +############################################################################## +# Sub: checkPrerequisites +# Check any prerequisites before a deployment is begun. For example if a +# particular deployment required the use of a database system it could +# check here that the db was installed and accessible. +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub checkPrerequisites +{ + my ($self, $cfg, $log) = @_; + + if (! defined $ENV{'HADOOP_HOME'} || $ENV{'HADOOP_HOME'} eq "") { + print $log "You must set the environment variable HADOOP_HOME"; + die "HADOOP_HOME not defined"; + } + + # Run a quick and easy Hadoop command to make sure we can + Util::runHadoopCmd($cfg, $log, "fs -ls /"); + +} + +############################################################################## +# Sub: deploy +# Deploy any required packages +# This is a no-op in this case because we're assuming both the cluster and the +# database already exist +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub deploy +{ +} + +############################################################################## +# Sub: start +# Start any software modules that are needed. +# This is a no-op in this case because we're assuming both the cluster and the +# database already exist +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub start +{ +} + +############################################################################## +# Sub: generateData +# Generate any data needed for this test run. 
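+# (Editor's note, illustrative only: for each table described below this sub
+# roughly (1) runs the configured 'gentool' to generate the data files locally,
+# (2) copies them into HDFS under 'hcat_data_dir' with hadoop fs -mkdir and
+# -copyFromLocal, (3) loads the table into HCatalog via Util::runHCatCmdFromFile
+# using the generated <name>.hcat.sql file, and (4) loads the benchmark
+# database via Util::runDbCmd using <name>.mysql.sql. The Util helpers live
+# elsewhere in the harness and are assumed here, not shown.)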
+# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub generateData +{ + my ($self, $cfg, $log) = @_; + my @tables = ( + { + 'name' => "studenttab10k", + 'filetype' => "studenttab", + 'rows' => 10000, + 'hdfs' => "studenttab10k", + }, { + 'name' => "votertab10k", + 'filetype' => "votertab", + 'rows' => 10000, + 'hdfs' => "votertab10k", + }, { + 'name' => "studentparttab30k", + 'filetype' => "studentparttab", + 'rows' => 10000, + 'hdfs' => "studentparttab30k", + 'partitions' => ['20110924', '20110925', '20110926'] + },{ + 'name' => "studentnull10k", + 'filetype' => "studentnull", + 'rows' => 10000, + 'hdfs' => "studentnull10k", + },{ + 'name' => "all100k", + 'filetype' => "allscalars", + 'rows' => 100000, + 'hdfs' => "all100k", + },{ + 'name' => "all100kjson", + 'filetype' => "json", + 'rows' => 100000, + 'hdfs' => "all100kjson", + },{ + 'name' => "all100krc", + 'filetype' => "studenttab", + 'rows' => 100000, + 'hdfs' => "all100krc", + 'format' => "rc", + } + ); + + + if (defined($cfg->{'load_hive_only'}) && $cfg->{'load_hive_only'} == 1) { + return $self->hiveMetaOnly($cfg, $log, \@tables); + } + + # Create the HDFS directories + Util::runHadoopCmd($cfg, $log, "fs -mkdir $cfg->{'hcat_data_dir'}"); + + foreach my $table (@tables) { + print "Generating data for $table->{'name'}\n"; + # Generate the data + my @cmd; + if (defined($table->{'format'})) { + @cmd = ($cfg->{'gentool'}, $table->{'filetype'}, $table->{'rows'}, + $table->{'name'}, $cfg->{'hcat_data_dir'}, $table->{'format'}); + } else { + @cmd = ($cfg->{'gentool'}, $table->{'filetype'}, $table->{'rows'}, + $table->{'name'}, $cfg->{'hcat_data_dir'}); + } + $self->runCmd($log, \@cmd); + + # Copy the data to HDFS + my $hadoop = "fs -mkdir $cfg->{'hcat_data_dir'}/$table->{'hdfs'}"; + Util::runHadoopCmd($cfg, $log, $hadoop); + + if (defined($table->{'partitions'})) { + foreach my $part (@{$table->{'partitions'}}) { + my $hadoop = "fs -mkdir + $cfg->{'hcat_data_dir'}/$table->{'hdfs'}/$table->{'name'}.$part"; + Util::runHadoopCmd($cfg, $log, $hadoop); + my $hadoop = "fs -copyFromLocal $table->{'name'}.$part " . + "$cfg->{'hcat_data_dir'}/$table->{'hdfs'}/$table->{'name'}.$part/$table->{'name'}.$part"; + Util::runHadoopCmd($cfg, $log, $hadoop); + } + } else { + my $hadoop = "fs -copyFromLocal $table->{'name'} ". + "$cfg->{'hcat_data_dir'}/$table->{'hdfs'}/$table->{'name'}"; + Util::runHadoopCmd($cfg, $log, $hadoop); + } + + print "Loading data into Hive for $table->{'name'}\n"; + Util::runHCatCmdFromFile($cfg, $log, + "./" . $table->{'name'} . ".hcat.sql"); + + print "Loading data into MySQL for $table->{'name'}\n"; + Util::runDbCmd($cfg, $log, $table->{'name'} . ".mysql.sql"); + } + +} + +########################################################################### +# Sub: hiveMetaOnly +# Load metadata into Hive, but don't load Mysql or HDFS, as we assume +# these have already been loaded. +# +# Paramaters: +# cfg - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub hiveMetaOnly +{ + my ($self, $cfg, $log, $tables) = @_; + foreach my $table (@{$tables}) { + print "Generating data for $table->{'name'}\n"; + # Generate the data + my @cmd = ($cfg->{'gentool'}, $table->{'filetype'}, $table->{'rows'}, + $table->{'name'}, $cfg->{'hcat_data_dir'}); + $self->runCmd($log, \@cmd); + + print "Loading data into Hive for $table->{'name'}\n"; + Util::runHCatCmdFromFile($cfg, $log, "./" . $table->{'name'} . 
+ ".hive.sql"); + } +} + +############################################################################## +# Sub: confirmDeployment +# Run checks to confirm that the deployment was successful. When this is +# done the testing environment should be ready to run. +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# Nothing +# This method should die with an appropriate error message if there is +# an issue. +# +sub confirmDeployment +{ +} + +############################################################################## +# Sub: deleteData +# Remove any data created that will not be removed by undeploying. +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub deleteData +{ +} + +############################################################################## +# Sub: stop +# Stop any servers or systems that are no longer needed once testing is +# completed. +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub stop +{ +} + +############################################################################## +# Sub: undeploy +# Remove any packages that were installed as part of the deployment. +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# None +# +sub undeploy +{ +} + +############################################################################## +# Sub: confirmUndeployment +# Run checks to confirm that the undeployment was successful. When this is +# done anything that must be turned off or removed should be turned off or +# removed. +# +# Paramaters: +# globalHash - hash from config file, including deployment config +# log - log file handle +# +# Returns: +# Nothing +# This method should die with an appropriate error message if there is +# an issue. +# +sub confirmUndeployment +{ + die "$0 INFO : confirmUndeployment is a virtual function!"; +} + +sub runCmd($$$) +{ + my ($self, $log, $cmd) = @_; + + print $log "Going to run [" . join(" ", @$cmd) . "]\n"; + + run($cmd, \undef, $log, $log) or + die "Failed running " . join(" ", @$cmd) . "\n"; +} + +1; Index: src/test/e2e/hcatalog/drivers/TestDriverHadoop.pm =================================================================== --- src/test/e2e/hcatalog/drivers/TestDriverHadoop.pm (revision 0) +++ src/test/e2e/hcatalog/drivers/TestDriverHadoop.pm (revision 0) @@ -0,0 +1,735 @@ +package TestDriverHadoop; + +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +############################################################################### +# Test driver for pig nightly tests. +# +# + +use TestDriver; +use IPC::Run; # don't do qw(run), it screws up TestDriver which also has a run method +use Digest::MD5 qw(md5_hex); +use Util; +use File::Path; +use Cwd; + +use English; + +our $className= "TestDriver"; +our @ISA = "$className"; +our $ROOT = (defined $ENV{'HARNESS_ROOT'} ? $ENV{'HARNESS_ROOT'} : die "ERROR: You must set environment variable HARNESS_ROOT\n"); +our $toolpath = "$ROOT/libexec/HCatTest"; + +my $passedStr = 'passed'; +my $failedStr = 'failed'; +my $abortedStr = 'aborted'; +my $skippedStr = 'skipped'; +my $dependStr = 'failed_dependency'; + +sub new +{ + # Call our parent + my ($proto) = @_; + my $class = ref($proto) || $proto; + my $self = $class->SUPER::new; + + bless($self, $class); + return $self; +} + +sub replaceParameters +{ +##!!! Move this to Util.pm + + my ($self, $cmd, $outfile, $testCmd, $log) = @_; + + # $self + $cmd =~ s/:LATESTOUTPUTPATH:/$self->{'latestoutputpath'}/g; + + # $outfile + $cmd =~ s/:OUTPATH:/$outfile/g; + + # $ENV + $cmd =~ s/:PIGHARNESS:/$ENV{HARNESS_ROOT}/g; + + # $testCmd + $cmd =~ s/:INPATH:/$testCmd->{'inpathbase'}/g; + $cmd =~ s/:OUTPATH:/$outfile/g; + $cmd =~ s/:FUNCPATH:/$testCmd->{'funcjarPath'}/g; + $cmd =~ s/:PIGPATH:/$testCmd->{'pigpath'}/g; + $cmd =~ s/:RUNID:/$testCmd->{'UID'}/g; + $cmd =~ s/:USRHOMEPATH:/$testCmd->{'userhomePath'}/g; + $cmd =~ s/:MAPREDJARS:/$testCmd->{'mapredjars'}/g; + $cmd =~ s/:SCRIPTHOMEPATH:/$testCmd->{'scriptPath'}/g; + $cmd =~ s/:DBUSER:/$testCmd->{'dbuser'}/g; + $cmd =~ s/:DBNAME:/$testCmd->{'dbdb'}/g; +# $cmd =~ s/:LOCALINPATH:/$testCmd->{'localinpathbase'}/g; +# $cmd =~ s/:LOCALOUTPATH:/$testCmd->{'localoutpathbase'}/g; +# $cmd =~ s/:LOCALTESTPATH:/$testCmd->{'localpathbase'}/g; + $cmd =~ s/:BMPATH:/$testCmd->{'benchmarkPath'}/g; + $cmd =~ s/:TMP:/$testCmd->{'tmpPath'}/g; + $cmd =~ s/:HDFSTMP:/tmp\/$testCmd->{'runid'}/g; + + if ( $testCmd->{'hadoopSecurity'} eq "secure" ) { + $cmd =~ s/:REMOTECLUSTER:/$testCmd->{'remoteSecureCluster'}/g; + } else { + $cmd =~ s/:REMOTECLUSTER:/$testCmd->{'remoteNotSecureCluster'}/g; + } + + $cmd =~ s/:THRIFTSERVER:/$testCmd->{'thriftserver'}/g; + $cmd =~ s/:HADOOP_CLASSPATH:/$testCmd->{'hadoop_classpath'}/g; + $cmd =~ s/:HCAT_JAR:/$testCmd->{'hcatalog.jar'}/g; + + return $cmd; +} + +sub globalSetup +{ + my ($self, $globalHash, $log) = @_; + my $subName = (caller(0))[3]; + + + # Setup the output path + my $me = `whoami`; + chomp $me; + $globalHash->{'runid'} = $me . "." . time; + + # if "-ignore false" was provided on the command line, + # it means do run tests even when marked as 'ignore' + if(defined($globalHash->{'ignore'}) && $globalHash->{'ignore'} eq 'false') + { + $self->{'ignore'} = 'false'; + } + + $globalHash->{'outpath'} = $globalHash->{'outpathbase'} . "/" . $globalHash->{'runid'} . "/"; + $globalHash->{'localpath'} = $globalHash->{'localpathbase'} . "/" . $globalHash->{'runid'} . "/"; + + # add libexec location to the path + if (defined($ENV{'PATH'})) { + $ENV{'PATH'} = $globalHash->{'scriptPath'} . ":" . $ENV{'PATH'}; + } + else { + $ENV{'PATH'} = $globalHash->{'scriptPath'}; + } + + my @cmd = ($self->getPigCmd($globalHash, $log), '-e', 'mkdir', $globalHash->{'outpath'}); + + print $log "Going to run " . join(" ", @cmd) . "\n"; + IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? 
- $!\n"; + + IPC::Run::run(['mkdir', '-p', $globalHash->{'localpath'}], \undef, $log, $log) or + die "Cannot create localpath directory " . $globalHash->{'localpath'} . + " " . "$ERRNO\n"; + + IPC::Run::run(['mkdir', '-p', $globalHash->{'benchmarkPath'}], \undef, $log, $log) or + die "Cannot create benchmark directory " . $globalHash->{'benchmarkPath'} . + " " . "$ERRNO\n"; + + # Create the temporary directory + IPC::Run::run(['mkdir', '-p', $globalHash->{'tmpPath'}], \undef, $log, $log) or + die "Cannot create temporary directory " . $globalHash->{'tmpPath'} . + " " . "$ERRNO\n"; + + # Create the HDFS temporary directory + @cmd = ($self->getPigCmd($globalHash, $log), '-e', 'mkdir', "tmp/$globalHash->{'runid'}"); + print $log "Going to run " . join(" ", @cmd) . "\n"; + IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; +} + +sub globalCleanup +{ +} + + +sub runTest +{ + my ($self, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + + # Handle the various methods of running used in + # the original TestDrivers + + if ( $testCmd->{'hcat_prep'} ) { + Util::prepareHCat($self, $testCmd, $log); + } + + if ( $testCmd->{'hadoop'} ) { + my $result; + if (defined($testCmd->{'result_table'})) { + $result = $self->runHadoop( $testCmd, $log ); + my @results = (); + my @outputs = (); + if (ref($testCmd->{'result_table'}) ne 'ARRAY') { + $results[0] = $testCmd->{'result_table'}; + } else { + @results = @{$testCmd->{'result_table'}}; + } + + my $id = 0; # regular ouput count + for (my $i = 0; $i < @results; $i++) { + if ($results[$i] ne '?') { + my %modifiedTestCmd = %{$testCmd}; + $pigfiles[$i] = $testCmd->{'localpath'} . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . + ".dumptable.$i.pig"; + $outfiles[$i] = $testCmd->{'thisResultsPath'} . "/" . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".$i.out"; + $tableName = $results[$i]; + $modifiedTestCmd{'num'} = $testCmd->{'num'} . "_" . $i . "_benchmark"; + $modifiedTestCmd{'pig'} = "a = load '$tableName' using org.apache.hcatalog.pig.HCatLoader(); store a into ':OUTPATH:';"; + my $r = $self->runPig(\%modifiedTestCmd, $log, 1); + $outputs[$i] = $r->{'output'}; + } else { + $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + # Copy result file out of hadoop + my @baseCmd = $self->getPigCmd($testCmd, $log); + my $testOut = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + $outputs[$i] = $testOut; + $id++; + } + } + $result->{'outputs'}=\@outputs; + if ($self->countStores($testCmd)==1) { + $result->{'output'}=$outputs[0]; + } + } + else { + $result = $self->runHadoop( $testCmd, $log ); + } + return $result; + } else { + die "$subName FATAL Did not find a testCmd that I know how to handle"; + } +} + +sub dumpPigTable +{ + my ($self, $testCmd, $table, $log, $id) = @_; + my $subName = (caller(0))[3]; + + my %result; + + # Write the pig script to a file. + my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.pig"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . 
"dump.out"; + + open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; + print FH "a = load '$table' using org.apache.hcatalog.pig.HCatLoader(); store a into '$outfile';\n"; + close(FH); + + + # Build the command + my @baseCmd = $self->getPigCmd($testCmd, $log); + my @cmd = @baseCmd; + + push(@cmd, $pigfile); + + + # Run the command + print $log "Setting PIG_CLASSPATH to $ENV{'PIG_CLASSPATH'}\n"; + print $log "$0::$className::$subName INFO: Going to run pig command: @cmd\n"; + + IPC::Run::run(\@cmd, \undef, $log, $log) or + die "Failed running $pigfile\n"; + $result{'rc'} = $? >> 8; + + + # Get results from the command locally + my $localoutfile; + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.out"; + + $outfile = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + return $outfile; +} + +sub postProcessSingleOutputFile +{ + my ($self, $outfile, $localdir, $baseCmd, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + + my @baseCmd = @{$baseCmd}; + my @copyCmd = @baseCmd; + push(@copyCmd, ('-e', 'copyToLocal', $outfile, $localdir)); + print $log "$0::$className::$subName INFO: Going to run pig command: @copyCmd\n"; + + IPC::Run::run(\@copyCmd, \undef, $log, $log) or die "Cannot copy results from HDFS $outfile to $localdir\n"; + + + # Sort the result if necessary. Keep the original output in one large file. + # Use system not IPC run so that the '*' gets interpolated by the shell. + + # Build command to: + # 1. Combine part files + my $fppCmd = "cat $localdir/map* $localdir/part* 2>/dev/null"; + + # 2. Standardize float precision + if (defined $testCmd->{'floatpostprocess'} && + defined $testCmd->{'delimiter'}) { + $fppCmd .= " | $toolpath/floatpostprocessor.pl '" . + $testCmd->{'delimiter'} . "'"; + } + + $fppCmd .= " > $localdir/out_original"; + + # run command + print $log "$fppCmd\n"; + system($fppCmd); + + # Sort the results for the benchmark compare. + my @sortCmd = ('sort', "$localdir/out_original"); + print $log join(" ", @sortCmd) . "\n"; + IPC::Run::run(\@sortCmd, '>', "$localdir/out_sorted"); + + return "$localdir/out_sorted"; +} + +sub runHadoop +# Being modified from runPig +# !!! Works, but need to add other arguments, like queue...??? +{ + my ($self, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + + my %result; + + # Write the hadoop command to a file. + my $hadoopfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".hadoop"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + my $hadoopcmd = $self->replaceParameters( $testCmd->{'hadoop'}, $outfile, $testCmd, $log ); + + # adjust for the leading and trailing new line often seen in the conf file's command directives + $hadoopcmd =~ s/^\s*(.*?)\s*$/\1/s; + + open(FH, "> $hadoopfile") or die "Unable to open file $hadoopfile to write hadoop command file, $ERRNO\n"; + print FH $hadoopcmd . 
"\n"; + close(FH); + + + # Build the command + my @cmd = Util::getHadoopCmd($testCmd); + + # Add command line arguments if they're provided + if (defined($testCmd->{'hadoop_cmdline_args'})) { + push(@cmd, @{$testCmd->{'hadoop_cmdline_args'}}); + } + + # Add the test command elements + push(@cmd, split(/ +/,$hadoopcmd)); + + # Set HADOOP_CLASSPATH environment variable if provided + if (defined($testCmd->{'hadoop_classpath'})) { + my $hadoop_classpath = $self->replaceParameters( $testCmd->{'hadoop_classpath'}, $outfile, $testCmd, $log ); + my $cp = $testCmd->{'hcatalog.jar'}; + $cp =~ s/,/:/g; + $ENV{'HADOOP_CLASSPATH'} = $cp; + } + + if (defined($testCmd->{'metastore.principal'}) && ($testCmd->{'metastore.principal'} =~ m/\S+/)) { + $ENV{'HADOOP_OPTS'} = "-Dhcat.metastore.principal=" . $testCmd->{'metastore.principal'}; + $ENV{'HADOOP_CLIENT_OPTS'} = "-Dhcat.metastore.principal=" . $testCmd->{'metastore.principal'}; + } + + # Add su user if provided + if (defined($testCmd->{'run_as'})) { + my $cmd = '"' . join (" ", @cmd) . '"'; + @cmd = ("echo", $cmd, "|", "su", $testCmd->{'run_as'}); + } + + my $script = $hadoopfile . ".sh"; + open(FH, ">$script") or die "Unable to open file $script to write script, $ERRNO\n"; + print FH join (" ", @cmd) . "\n"; + close(FH); + my @result=`chmod +x $script`; + + # Run the command + print $log "$0::$className::$subName INFO: Going to run hadoop command in shell script: $script\n"; + print $log "$0::$className::$subName INFO: Going to run hadoop command: " . join(" ", @cmd) . "\n"; + print $log "With HADOOP_CLASSPATH set to " . $ENV{'HADOOP_CLASSPATH'} . " and HADOOP_OPTS set to " . $ENV{'HADOOP_OPTS'} . "\n"; + + my @runhadoop = ("$script"); + IPC::Run::run(\@runhadoop, \undef, $log, $log) or + die "Failed running $script\n"; + + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.out"; + my @baseCmd = $self->getPigCmd($testCmd, $log); + if ($self->countStores($testCmd)==1) { + @outputs = (); + $outputs[0] = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + $result{'outputs'} = \@outputs; + } + + return \%result; +} # end sub runHadoop + + +sub compare +{ + my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_; + my $subName = (caller(0))[3]; + + my $result; + + if (defined($testResult->{'outputs'})) { + my $res = 0; + my @outputs = $testResult->{'outputs'}; + my $count = @outputs; + for (my $id = 0; $id < $count; $id++) { + my $testOutput = ($testResult->{'outputs'})->[$id]; + my $benchmarkOutput = ($benchmarkResult->{'outputs'})->[$id]; + $res += $self->compareSingleOutput($testResult, $testOutput, + $benchmarkOutput, $log); + $result = ($res == ($count)) ? 
1 : 0; + } + } else { + $result = $self->compareSingleOutput($testResult, $testResult->{'output'}, + $benchmarkResult->{'output'}, $log); + } + + return $result; +} + +sub generateBenchmark +{ + my ($self, $testCmd, $log) = @_; + + my %result; + + my @SQLQuery = @{$testCmd->{'sql'}}; + my @SQLQuery = (); + if (ref($testCmd->{'sql'}) ne 'ARRAY') { + $SQLQuery[0] = $testCmd->{'sql'}; + } else { + @SQLQuery = @{$testCmd->{'sql'}}; + } + + my @outfiles = (); + for (my $id = 0; $id < ($#SQLQuery + 1); $id++) { + my $sql = $SQLQuery[$id]; + my $outfile = $self->generateSingleSQLBenchmark($testCmd, $sql, ($id+1), $log); + push(@outfiles, $outfile); + } + $result{'outputs'} = \@outfiles; + + return \%result; +} + +sub generateSingleSQLBenchmark +{ + my ($self, $testCmd, $sql, $id, $log) = @_; + + my $qmd5 = substr(md5_hex($testCmd->{'pig'}), 0, 5); + my $sqlfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".benchmark.$id.sql"; + my $outfile = $testCmd->{'benchmarkPath'} . "/" . $testCmd->{'group'} . "_" . $testCmd->{'num'}; + + $outfile .= defined($id) ? ".$id" . ".out" : ".out"; + + my $outfp; + open($outfp, "> $outfile") or + die "Unable to open output file $outfile, $!\n"; + + open(FH, "> $sqlfile") or + die "Unable to open file $sqlfile to write SQL script, $ERRNO\n"; + print FH $sql; + close(FH); + + Util::runDbCmd($testCmd, $log, $sqlfile, $outfp); + + $rcs[$i] = $? >> 8; + close($outfp); + + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + $outfile = + $self->postProcessSingleSQLOutputFile($outfile, $testCmd, $log); + + return $outfile; +} + +sub postProcessSingleSQLOutputFile +{ + my ($self, $outfile, $testCmd, $log, $isBenchmark) = @_; + + # If requested, process the data to smooth over floating point + # differences. + if (defined $testCmd->{'floatpostprocess'} && + defined $testCmd->{'delimiter'}) { + # Move the file to a temp file and run through the pre-processor. + my $tmpfile = "$outfile.tmp"; + link($outfile, $tmpfile) or + die "Unable to create temporary file $tmpfile, $!\n"; + unlink($outfile) or + die "Unable to unlink file $outfile, $!\n"; + open(IFH, "< $tmpfile") or + die "Unable to open file $tmpfile, $!\n"; + open(OFH, "> $outfile") or + die "Unable to open file $outfile, $!\n"; + my @cmd = ("$toolpath/floatpostprocessor.pl", + $testCmd->{'delimiter'}); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, \*IFH, \*OFH, $log) or + die "Failed to run float postprocessor, $!\n"; + close(IFH); + close(OFH); + unlink($tmpfile); + } + + if ($isBenchmark && defined $testCmd->{'nullpostprocess'}) { + # Move the file to a temp file and run through the pre-processor. + my $tmpfile = "$outfile.tmp"; + link($outfile, $tmpfile) or + die "Unable to create temporary file $tmpfile, $!\n"; + unlink($outfile) or + die "Unable to unlink file $outfile, $!\n"; + open(IFH, "< $tmpfile") or + die "Unable to open file $tmpfile, $!\n"; + open(OFH, "> $outfile") or + die "Unable to open file $outfile, $!\n"; + my @cmd = ("sed", "s/NULL//g"); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, \*IFH, \*OFH, $log) or + die "Failed to run float postprocessor, $!\n"; + close(IFH); + close(OFH); + unlink($tmpfile); + } + + # Sort the results for the benchmark compare. + my $sortfile = "$outfile.sorted"; + my @cmd = ("sort", $outfile); + print $log "Going to run [" . join(" ", @cmd) . 
"]\n"; + IPC::Run::run(\@cmd, '>', "$sortfile"); + + return $sortfile; +} + +sub runPig +{ + my ($self, $testCmd, $log, $copyResults) = @_; + my $subName = (caller(0))[3]; + + my %result; + + # Write the pig script to a file. + my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".pig"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + my $pigcmd = $self->replaceParameters( $testCmd->{'pig'}, $outfile, $testCmd, $log ); + + open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; + print FH $pigcmd . "\n"; + close(FH); + + + # Build the command + my @baseCmd = $self->getPigCmd($testCmd, $log); + my @cmd = @baseCmd; + + # Add option -l giving location for secondary logs + my $locallog = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".log"; + push(@cmd, "-logfile"); + push(@cmd, $locallog); + + # Add pig parameters if they're provided + if (defined($testCmd->{'pig_params'})) { + # Processing :PARAMPATH: in parameters + foreach my $param (@{$testCmd->{'pig_params'}}) { + $param =~ s/:PARAMPATH:/$testCmd->{'paramPath'}/g; + } + push(@cmd, @{$testCmd->{'pig_params'}}); + } + + push(@cmd, $pigfile); + + + # Run the command + print $log "Setting PIG_CLASSPATH to $ENV{'PIG_CLASSPATH'}\n"; + print $log "$0::$className::$subName INFO: Going to run pig command: @cmd\n"; + + IPC::Run::run(\@cmd, \undef, $log, $log) or + die "Failed running $pigfile\n"; + $result{'rc'} = $? >> 8; + + + # Get results from the command locally + my $localoutfile; + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + my $stores = $self->countStores($testCmd); + + # single query + if ($stores == 1) { + if ($copyResults) { + $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile + } else { + $result{'output'} = "NO_COPY"; + } + } + # multi query + else { + my @outfiles = (); + for (my $id = 1; $id <= ($stores); $id++) { + $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id"; + $localoutfile = $outfile . ".$id"; + + # Copy result file out of hadoop + my $testOut; + if ($copyResults) { + $testOut = $self->postProcessSingleOutputFile($localoutfile, $localdir, \@baseCmd, $testCmd, $log); + } else { + $testOut = "NO_COPY"; + } + push(@outfiles, $testOut); + } + ##!!! originalOutputs not set! Needed? + $result{'outputs'} = \@outfiles; + } + + # Compare doesn't get the testCmd hash, so I need to stuff the necessary + # info about sorting into the result. + if (defined $testCmd->{'sortArgs'} && $testCmd->{'sortArgs'}) { + $result{'sortArgs'} = $testCmd->{'sortArgs'}; + } + + return \%result; +} + +sub getPigCmd($$$) +{ + my ($self, $testCmd, $log) = @_; + + my @pigCmd; + + # set the PIG_CLASSPATH environment variable + my $pcp .= $testCmd->{'jythonjar'} if (defined($testCmd->{'jythonjar'})); + $pcp .= ":" . $testCmd->{'classpath'} if (defined($testCmd->{'classpath'})); + $pcp .= ":" . $testCmd->{'additionaljars'} if (defined($testCmd->{'additionaljars'})); + # Only add testconfigpath to PIG_CLASSPATH if HADOOP_HOME isn't defined + $pcp .= ":" . $testCmd->{'testconfigpath'} if ($testCmd->{'exectype'} ne "local"); #&& (! defined $ENV{'HADOOP_HOME'}); + + # Set it in our current environment. It will get inherited by the IPC::Run + # command. 
+ $ENV{'PIG_CLASSPATH'} = $pcp; + + @pigCmd = ("$testCmd->{'pigpath'}/bin/pig"); + + if (defined($testCmd->{'additionaljars'})) { + push(@pigCmd, '-Dpig.additional.jars='.$testCmd->{'additionaljars'}); + } + + if ($testCmd->{'exectype'} eq "local") { + push(@{$testCmd->{'java_params'}}, "-Xmx1024m"); + push(@pigCmd, ("-x", "local")); + } + + my $opts .= "-Dhcat.metastore.uri=$testCmd->{'thriftserver'}"; + if (defined($testCmd->{'java_params'})) { + $opts = $opts . " " . join(" ", @{$testCmd->{'java_params'}}); + } + + $ENV{'PIG_OPTS'} = $opts; + + print $log "Returning Pig command " . join(" ", @pigCmd) . "\n"; + print $log "With PIG_CLASSPATH set to " . $ENV{'PIG_CLASSPATH'} . " and PIG_OPTS set to " . $ENV{'PIG_OPTS'} . "\n"; + return @pigCmd; +} + +sub compareSingleOutput +{ + my ($self, $testResult, $testOutput, $benchmarkOutput, $log) = @_; + +print $log "testResult: $testResult testOutput: $testOutput benchmarkOutput: $benchmarkOutput\n"; + + # cksum the the two files to see if they are the same + my ($testChksm, $benchmarkChksm); + IPC::Run::run((['cat', $testOutput], '|', ['cksum']), \$testChksm, + $log) or die "$0: error: cannot run cksum on test results\n"; + IPC::Run::run((['cat', $benchmarkOutput], '|', ['cksum']), + \$benchmarkChksm, $log) or die "$0: error: cannot run cksum on benchmark\n"; + + chomp $testChksm; + chomp $benchmarkChksm; + print $log "test cksum: $testChksm\nbenchmark cksum: $benchmarkChksm\n"; + + my $result; + if ($testChksm ne $benchmarkChksm) { + print $log "Test output checksum does not match benchmark checksum\n"; + print $log "Test checksum = <$testChksm>\n"; + print $log "Expected checksum = <$benchmarkChksm>\n"; + print $log "RESULTS DIFFER: vimdiff " . cwd . "/$testOutput " . cwd . "/$benchmarkOutput\n"; + } else { + $result = 1; + } + + # Now, check if the sort order is specified + if (defined($testResult->{'sortArgs'})) { + Util::setLocale(); + my @sortChk = ('sort', '-cs'); + push(@sortChk, @{$testResult->{'sortArgs'}}); + push(@sortChk, $testResult->{'originalOutput'}); + print $log "Going to run sort check command: " . join(" ", @sortChk) . "\n"; + IPC::Run::run(\@sortChk, \undef, $log, $log); + my $sortrc = $?; + if ($sortrc) { + print $log "Sort check failed\n"; + $result = 0; + } + } + + return $result; +} + +############################################################################## +# Count the number of stores in a Pig Latin script, so we know how many files +# we need to compare. 
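+# (Editor's illustration, not generated by the patch: the per-line regex below
+# counts statements of the form
+#     store a into ':OUTPATH:';
+# one match per line; stores that are commented out, or several stores placed
+# on a single line, are not counted correctly, as the in-line comments warn.)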
+# +sub countStores($$) +{ + my ($self, $testCmd) = @_; + + if (defined $testCmd->{'pig'}) { + my $count; + + # hope they don't have more than store per line + # also note that this won't work if you comment out a store + my @q = split(/\n/, $testCmd->{'pig'}); + for (my $i = 0; $i < @q; $i++) { + $count += $q[$i] =~ /store\s+[a-zA-Z][a-zA-Z0-9_]*\s+into/i; + } + + return $count; + + } + else { + #defined $testCmd->{'hadoop'} + my $count; + + my @q = split(/\n/, $testCmd->{'hadoop'}); + for (my $i = 0; $i < @q; $i++) { + $count += $q[$i] =~ /OUTPATH/ig; + } + + return $count; + + } + +} + +1; Index: src/test/e2e/hcatalog/drivers/TestDriverHive.pm =================================================================== --- src/test/e2e/hcatalog/drivers/TestDriverHive.pm (revision 0) +++ src/test/e2e/hcatalog/drivers/TestDriverHive.pm (revision 0) @@ -0,0 +1,383 @@ +package TestDriverHive; + +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### +# Test driver for hive nightly tests. +# +# + +use TestDriver; +use IPC::Run; # don't do qw(run), it screws up TestDriver which also has a run method +use Digest::MD5 qw(md5_hex); +use Util; +use File::Path; +use Cwd; + +use strict; +use English; + +our $className= "TestDriver"; +our @ISA = "$className"; +our $ROOT = (defined $ENV{'HARNESS_ROOT'} ? $ENV{'HARNESS_ROOT'} : die "ERROR: You must set environment variable HARNESS_ROOT\n"); +our $toolpath = "$ROOT/libexec/HCatTest"; + +my $passedStr = 'passed'; +my $failedStr = 'failed'; +my $abortedStr = 'aborted'; +my $skippedStr = 'skipped'; +my $dependStr = 'failed_dependency'; + +sub new +{ + # Call our parent + my ($proto) = @_; + my $class = ref($proto) || $proto; + my $self = $class->SUPER::new; + + bless($self, $class); + return $self; +} + +sub replaceParameters +{ +##!!! Move this to Util.pm + + my ($self, $cmd, $outfile, $testCmd, $log) = @_; + + # $self + $cmd =~ s/:LATESTOUTPUTPATH:/$self->{'latestoutputpath'}/g; + + # $outfile + $cmd =~ s/:OUTPATH:/$outfile/g; + + # $ENV + $cmd =~ s/:HARNESS:/$ENV{HARNESS_ROOT}/g; + + # $testCmd + $cmd =~ s/:INPATH:/$testCmd->{'inpathbase'}/g; + + return $cmd; +} + +sub globalSetup +{ + my ($self, $globalHash, $log) = @_; + my $subName = (caller(0))[3]; + + # Set up values for the metastore + Util::setupHiveProperties($globalHash, $log); + + # Setup the output path + my $me = `whoami`; + chomp $me; + $globalHash->{'runid'} = $me . "." . time; + + $globalHash->{'localpath'} = $globalHash->{'localpathbase'} . "/" . $globalHash->{'runid'} . "/"; + + IPC::Run::run(['mkdir', '-p', $globalHash->{'localpath'}], \undef, $log, $log) or + die "Cannot create localpath directory " . 
$globalHash->{'localpath'} . + " " . "$ERRNO\n"; + + IPC::Run::run(['mkdir', '-p', $globalHash->{'benchmarkPath'}], \undef, $log, $log) or + die "Cannot create benchmark directory " . $globalHash->{'benchmarkPath'} . + " " . "$ERRNO\n"; + + $globalHash->{'thisResultsPath'} = $globalHash->{'localpath'} . "/" + . $globalHash->{'resultsPath'}; + IPC::Run::run(['mkdir', '-p', $globalHash->{'thisResultsPath'}], \undef, $log, $log) or + die "Cannot create results directory " . $globalHash->{'thisResultsPath'} . + " " . "$ERRNO\n"; +} + +sub globalCleanup +{ + my ($self, $globalHash, $log) = @_; +} + + +sub runTest +{ + my ($self, $testCmd, $log) = @_; + + my %result; + + my @hivefiles = (); + my @outfiles = (); + # Write the hive script to a file. + $hivefiles[0] = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . + $testCmd->{'num'} . ".0.sql"; + $outfiles[0] = $testCmd->{'thisResultsPath'} . "/" . $testCmd->{'group'} . + "_" . $testCmd->{'num'} . ".0.out"; + + open(FH, "> $hivefiles[0]") or + die "Unable to open file $hivefiles[0] to write SQL script, $ERRNO\n"; + print FH $testCmd->{'sql'} . "\n"; + close(FH); + + # If the results are written to a table run the command and then + # run a another Hive command to dump the results of the table. + if (defined($testCmd->{'result_table'})) { + Util::runHiveCmdFromFile($testCmd, $log, $hivefiles[0]); + $result{'rc'} = $? >> 8; + + my @results = (); + if (ref($testCmd->{'result_table'}) ne 'ARRAY') { + $results[0] = $testCmd->{'result_table'}; + } else { + @results = @{$testCmd->{'result_table'}}; + } + for (my $i = 0; $i < @results; $i++) { + $hivefiles[$i] = $testCmd->{'localpath'} . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . + ".dumptable.$i.sql"; + $outfiles[$i] = $testCmd->{'thisResultsPath'} . "/" . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".$i.out"; + open(FH, "> $hivefiles[$i]") or + die "Unable to open file $hivefiles[$i] to write SQL " . + "script, $ERRNO\n"; + print FH "select * from " . $results[$i] . ";\n"; + close(FH); + } + } + + my @originalOutputs = (); + my @outputs = (); + $result{'originalOutput'} = \@originalOutputs; + $result{'output'} = \@outputs; + + for (my $i = 0; $i < @hivefiles; $i++) { + my $outfp; + open($outfp, "> $outfiles[$i]") or + die "Unable to open output file $outfiles[$i], $!\n"; + + Util::runHiveCmdFromFile($testCmd, $log, $hivefiles[$i], $outfp); + + # Don't overwrite rc if we set it above + $result{'rc'} = $? >> 8 unless defined $result{'rc'}; + close($outfp); + + $originalOutputs[$i] = $outfiles[$i]; + $outputs[$i] = + $self->postProcessSingleOutputFile($outfiles[$i], $testCmd, $log); + } + + # Compare doesn't get the testCmd hash, so I need to stuff the necessary + # info about sorting into the result. + if (defined $testCmd->{'sortArgs'} && $testCmd->{'sortArgs'}) { + $result{'sortArgs'} = $testCmd->{'sortArgs'}; + } + + return \%result; +} + + + +sub generateBenchmark +{ + my ($self, $testCmd, $log) = @_; + + my %result; + + # Write the SQL to a file. + my @verifies = (); + if (defined $testCmd->{'verify_sql'}) { + if (ref($testCmd->{'verify_sql'}) eq "ARRAY") { + @verifies = @{$testCmd->{'verify_sql'}}; + } else { + $verifies[0] = $testCmd->{'verify_sql'}; + } + } else { + $verifies[0] = $testCmd->{'sql'}; + } + + my @rcs = (); + $result{'rc'} = \@rcs; + my @outputs = (); + $result{'output'} = \@outputs; + for (my $i = 0; $i < @verifies; $i++) { + my $sqlfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . + $testCmd->{'num'} . 
".benchmark.$i.sql"; + my $outfile = $testCmd->{'benchmarkPath'} . "/" . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . + ".benchmark.$i.out"; + + open(FH, "> $sqlfile") or + die "Unable to open file $sqlfile to write SQL script, $ERRNO\n"; + print FH $verifies[$i]; + close(FH); + + my $outfp; + open($outfp, "> $outfile") or + die "Unable to open output file $outfile, $!\n"; + + Util::runDbCmd($testCmd, $log, $sqlfile, $outfp); + $rcs[$i] = $? >> 8; + close($outfp); + + $outputs[$i] = + $self->postProcessSingleOutputFile($outfile, $testCmd, $log, 1); + } + + return \%result; +} + +sub compare +{ + my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_; + + # Make sure we have the same number of results from runTest and + # generateBenchmark + if (scalar(@{$testResult->{'output'}}) != + scalar(@{$benchmarkResult->{'output'}})) { + die "runTest returned " . scalar(@{$testResult->{'output'}}) . + " results, but generateBenchmark returned " . + scalar(@{$benchmarkResult->{'output'}}) . "\n"; + } + + my $totalFailures = 0; + for (my $i = 0; $i < @{$testResult->{'output'}}; $i++) { + # cksum the the two files to see if they are the same + my ($testChksm, $benchmarkChksm); + IPC::Run::run((['cat', @{$testResult->{'output'}}[$i]], '|', + ['cksum']), \$testChksm, $log) or + die "$0: error: cannot run cksum on test results\n"; + IPC::Run::run((['cat', @{$benchmarkResult->{'output'}}[$i]], '|', + ['cksum']), \$benchmarkChksm, $log) or + die "$0: error: cannot run cksum on benchmark\n"; + + chomp $testChksm; + chomp $benchmarkChksm; + print $log + "test cksum: $testChksm\nbenchmark cksum: $benchmarkChksm\n"; + + if ($testChksm ne $benchmarkChksm) { + print $log "Test output $i checksum does not match benchmark " . + "checksum\n"; + print $log "Test $i checksum = <$testChksm>\n"; + print $log "Expected $i checksum = <$benchmarkChksm>\n"; + print $log "RESULTS DIFFER: vimdiff " . cwd . + "/" . @{$testResult->{'output'}}[$i] . " " . cwd . + "/" . @{$benchmarkResult->{'output'}}[$i] . "\n"; + $totalFailures++; + } + + # Now, check if the sort order is specified + if (defined($testResult->{'sortArgs'})) { + my @sortChk = ('sort', '-cs'); + push(@sortChk, @{$testResult->{'sortArgs'}}); + push(@sortChk, @{$testResult->{'originalOutput'}}[$i]); + print $log "Going to run sort check command: " . + join(" ", @sortChk) . "\n"; + IPC::Run::run(\@sortChk, \undef, $log, $log); + my $sortrc = $?; + if ($sortrc) { + print $log "Sort check failed\n"; + $totalFailures++; + } + } + } + + return $totalFailures == 0; +} + +sub postProcessSingleOutputFile +{ + my ($self, $outfile, $testCmd, $log, $isBenchmark) = @_; + + # If requested, process the data to smooth over floating point + # differences. + if (defined $testCmd->{'floatpostprocess'} && + defined $testCmd->{'delimiter'}) { + # Move the file to a temp file and run through the pre-processor. + my $tmpfile = "$outfile.tmp"; + link($outfile, $tmpfile) or + die "Unable to create temporary file $tmpfile, $!\n"; + unlink($outfile) or + die "Unable to unlink file $outfile, $!\n"; + open(IFH, "< $tmpfile") or + die "Unable to open file $tmpfile, $!\n"; + open(OFH, "> $outfile") or + die "Unable to open file $outfile, $!\n"; + my @cmd = ("$toolpath/floatpostprocessor.pl", + $testCmd->{'delimiter'}); + print $log "Going to run [" . join(" ", @cmd) . 
"]\n"; + IPC::Run::run(\@cmd, \*IFH, \*OFH, $log) or + die "Failed to run float postprocessor, $!\n"; + close(IFH); + close(OFH); + unlink($tmpfile); + } + + if ($isBenchmark && defined $testCmd->{'nullpostprocess'}) { + # Move the file to a temp file and run through the pre-processor. + my $tmpfile = "$outfile.tmp"; + link($outfile, $tmpfile) or + die "Unable to create temporary file $tmpfile, $!\n"; + unlink($outfile) or + die "Unable to unlink file $outfile, $!\n"; + open(IFH, "< $tmpfile") or + die "Unable to open file $tmpfile, $!\n"; + open(OFH, "> $outfile") or + die "Unable to open file $outfile, $!\n"; + my @cmd = ("sed", "s/NULL//g"); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, \*IFH, \*OFH, $log) or + die "Failed to run float postprocessor, $!\n"; + close(IFH); + close(OFH); + unlink($tmpfile); + } + + # Sort the results for the benchmark compare. + my $sortfile = "$outfile.sorted"; + my @cmd = ("sort", $outfile); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, '>', "$sortfile"); + + return $sortfile; +} + + + +############################################################################## +# Count the number of stores in a Pig Latin script, so we know how many files +# we need to compare. +# +sub countStores($$) +{ + my ($self, $testCmd) = @_; + + # Special work around for queries with more than one store that are not + # actually multiqueries. + if (defined $testCmd->{'notmq'}) { + return 1; + } + + my $count; + + # hope they don't have more than store per line + # also note that this won't work if you comment out a store + my @q = split(/\n/, $testCmd->{'pig'}); + for (my $i = 0; $i < @q; $i++) { + $count += $q[$i] =~ /store\s+[a-zA-Z][a-zA-Z0-9_]*\s+into/i; + } + + return $count; +} + +1; Index: src/test/e2e/hcatalog/drivers/TestDriverHCat.pm =================================================================== --- src/test/e2e/hcatalog/drivers/TestDriverHCat.pm (revision 1208047) +++ src/test/e2e/hcatalog/drivers/TestDriverHCat.pm (working copy) @@ -1,44 +1,32 @@ package TestDriverHCat; -################### +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### +# Test driver for pig nightly tests. # -# In addition to what TestDriverPig.pm does, this should also support: # -# * Hive -# * HCat CLI -# * Hadoop commands with result to stdout (e.g. 'fs') -# * Hadoop mapreduce jobs (i.e. 
result in files) -# -# The two latter ones are implemented like the 'pig' directive -# in that just 'hadoop' is used, and if the verification directive is 'sql' -# then a benchmark file will be compared with the output file, -# othewise stdout or stderr plus rc can be checked against verification directives. -# -# Based upon the consolidated Pig driver, which -# supports what privously was handled by: -# -# - TestDriverPig.pm -# - TestDriverScript.pm -# - TestDriverPigCmdLine.pm -# - TestDriverPigMultiQuery.pm -# -# Some code are ripe to be factored out. In interest of time, not done now... - -# THINGS STIL TO DEAL WITH MARKED AS: - -##!!! - -# NOTE in particular that postProcessSingleOutputFile might need to be added to some run subs. - -############################################################################### - -#use Miners::Test::TestDriver; use TestDriver; use IPC::Run; # don't do qw(run), it screws up TestDriver which also has a run method use Digest::MD5 qw(md5_hex); use Util; use File::Path; +use Cwd; use strict; use English; @@ -61,49 +49,10 @@ my $class = ref($proto) || $proto; my $self = $class->SUPER::new; - $self->{'exectype'} = "mapred"; # till we know better (in globalSetup())! - $self->{'ignore'} = "true"; # till we know better (in globalSetup())! - bless($self, $class); return $self; } -############################################################################### -# This method has been copied over from TestDriver to make changes to -# support skipping tests which do not match current execution mode -# or which were marked as 'ignore' -# -# -# Static function, can be used by test_harness.pl -# Print the results so far, given the testStatuses hash. -# @param testStatuses - reference to hash of test status results. -# @param log - reference to file handle to print results to. -# @param prefix - A title to prefix to the results -# @returns nothing. -# -sub printResults -{ - my ($testStatuses, $log, $prefix) = @_; - - my ($pass, $fail, $abort, $depend, $skipped) = (0, 0, 0, 0, 0); - - foreach (keys(%$testStatuses)) { - ($testStatuses->{$_} eq $passedStr) && $pass++; - ($testStatuses->{$_} eq $failedStr) && $fail++; - ($testStatuses->{$_} eq $abortedStr) && $abort++; - ($testStatuses->{$_} eq $dependStr) && $depend++; - ($testStatuses->{$_} eq $skippedStr) && $skipped++; - } - - my $total = $pass + $fail + $skipped + $abort + $depend; - - my $msg = "$prefix, PASSED: $pass FAILED: $fail SKIPPED: $skipped ABORTED: $abort " . - "FAILED DEPENDENCY: $depend TOTAL: $total"; - print $log "$msg\n"; - print "$msg\r"; -} - - sub replaceParameters { ##!!! 
Move this to Util.pm @@ -116,9 +65,6 @@ # $outfile $cmd =~ s/:OUTPATH:/$outfile/g; - # The same directory where .pig, .sh, .hcat, out/ are produced for the run: - $cmd =~ s/:RUNDIR:/$testCmd->{'localpath'}/g; - # $ENV $cmd =~ s/:PIGHARNESS:/$ENV{HARNESS_ROOT}/g; @@ -126,19 +72,19 @@ $cmd =~ s/:INPATH:/$testCmd->{'inpathbase'}/g; $cmd =~ s/:OUTPATH:/$outfile/g; $cmd =~ s/:FUNCPATH:/$testCmd->{'funcjarPath'}/g; + $cmd =~ s/:PIGPATH:/$testCmd->{'pigpath'}/g; $cmd =~ s/:RUNID:/$testCmd->{'UID'}/g; $cmd =~ s/:USRHOMEPATH:/$testCmd->{'userhomePath'}/g; + $cmd =~ s/:MAPREDJARS:/$testCmd->{'mapredjars'}/g; $cmd =~ s/:SCRIPTHOMEPATH:/$testCmd->{'scriptPath'}/g; $cmd =~ s/:DBUSER:/$testCmd->{'dbuser'}/g; $cmd =~ s/:DBNAME:/$testCmd->{'dbdb'}/g; - $cmd =~ s/:LOCALINPATH:/$testCmd->{'localinpathbase'}/g; - $cmd =~ s/:LOCALOUTPATH:/$testCmd->{'localoutpathbase'}/g; +# $cmd =~ s/:LOCALINPATH:/$testCmd->{'localinpathbase'}/g; +# $cmd =~ s/:LOCALOUTPATH:/$testCmd->{'localoutpathbase'}/g; +# $cmd =~ s/:LOCALTESTPATH:/$testCmd->{'localpathbase'}/g; $cmd =~ s/:BMPATH:/$testCmd->{'benchmarkPath'}/g; $cmd =~ s/:TMP:/$testCmd->{'tmpPath'}/g; - $cmd =~ s/:ZEBRAJAR:/$testCmd->{'zebrajar'}/g; - $cmd =~ s/:FILER:/$testCmd->{'filerPath'}/g; - $cmd =~ s/:GRIDSTACK:/$testCmd->{'gridstack.root'}/g; - $cmd =~ s/:USER:/$ENV{USER}/g; + $cmd =~ s/:HDFSTMP:/tmp\/$testCmd->{'runid'}/g; if ( $testCmd->{'hadoopSecurity'} eq "secure" ) { $cmd =~ s/:REMOTECLUSTER:/$testCmd->{'remoteSecureCluster'}/g; @@ -146,37 +92,9 @@ $cmd =~ s/:REMOTECLUSTER:/$testCmd->{'remoteNotSecureCluster'}/g; } - # extra for hive, hcat, hadoop cmd - $cmd =~ s/:THRIFTSERVER:/$testCmd->{'thriftserver'}/g; - $cmd =~ s/:HADOOP_CLASSPATH:/$testCmd->{'hadoop_classpath'}/g; - $cmd =~ s/:HCAT_JAR:/$testCmd->{'hcatalog.jar'}/g; - - # used in script call to `java :CLASSPATH: ...` in bootstrap_hcat.conf - $cmd =~ s/:CLASSPATH:/$testCmd->{'classpath'}/g; - return $cmd; } -sub hiveWorkArounds -{ - my ($self, $cmd, $log) = @_; - my $subName = (caller(0))[3]; - - # return $cmd; - - # Work-around for Hive problem where INSERT OVERWRITE failed when called w/o hive.merge.mapfiles=false - if ($cmd =~ /insert overwrite/i) { - print $log "$0:$subName WARNING: setting hive.merge.mapfiles in command\n"; - $cmd = "\nset hive.merge.mapfiles=false;\n$cmd"; - # } else { - # print $log "$0:$subName DEBUG: NOT setting hive.merge.mapfiles in hive command\n"; - } - - return $cmd; -} - - - sub globalSetup { my ($self, $globalHash, $log) = @_; @@ -195,25 +113,9 @@ $self->{'ignore'} = 'false'; } - # if "-x local" was provided on the command line, - # it implies pig should be run in "local" mode -so - # change input and output paths - if(defined($globalHash->{'x'}) && $globalHash->{'x'} eq 'local') - { - $self->{'exectype'} = "local"; - $globalHash->{'inpathbase'} = $globalHash->{'localinpathbase'}; - $globalHash->{'outpathbase'} = $globalHash->{'localoutpathbase'}; - } $globalHash->{'outpath'} = $globalHash->{'outpathbase'} . "/" . $globalHash->{'runid'} . "/"; $globalHash->{'localpath'} = $globalHash->{'localpathbase'} . "/" . $globalHash->{'runid'} . "/"; - # extract the current zebra.jar file path from the classpath - # and enter it in the hash for use in the substitution of :ZEBRAJAR: - my $zebrajar = $globalHash->{'cp'}; - $zebrajar =~ s/zebra.jar.*/zebra.jar/; - $zebrajar =~ s/.*://; - $globalHash->{'zebrajar'} = $zebrajar; - # add libexec location to the path if (defined($ENV{'PATH'})) { $ENV{'PATH'} = $globalHash->{'scriptPath'} . ":" . 
$ENV{'PATH'}; @@ -221,120 +123,38 @@ else { $ENV{'PATH'} = $globalHash->{'scriptPath'}; } - - my $tmpUsePig = $globalHash->{'use-pig.pl'}; - $globalHash->{'use-pig.pl'} = 1; - my @cmd = (Util::getBasePigCmd($globalHash), '-e', 'mkdir', $globalHash->{'outpath'}); - $globalHash->{'use-pig.pl'} = $tmpUsePig; - - if($self->{'exectype'} eq "local") - { - @cmd = ('mkdir', '-p', $globalHash->{'outpath'}); - } - - - if($self->{'exectype'} eq "mapred") - { - my $id = `id -un`; - chomp $id; - if ($id eq 'root') { - # my @suCmd = ('su', 'hadoopqa', '-c', "'" . join(' ', @cmd) . "'"); - # print $log join(" ", @suCmd) . "\n"; - # IPC::Run::run(\@suCmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; - # above failed, doing below for now... - - my $command= join (" ", @cmd); - $command = "echo \"$command\" | su hadoopqa 2>&1"; - print $log "$command\n"; - my @result=`$command`; - my $rc = $? >> 8; - print $log "Output from create HDFS directory: " . join (" ", @result) . "\n"; - die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n" if $rc != 0; - - } else { - print $log join(" ", @cmd) . "\n"; - IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; - } - } - else - { - IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create directory " . $globalHash->{'outpath'} . "\n"; - } - - IPC::Run::run(['mkdir', '-p', $globalHash->{'localpath'}], \undef, $log, $log) or - die "Cannot create localpath directory " . $globalHash->{'localpath'} . - " " . "$ERRNO\n"; } -sub getCommand +sub globalCleanup { - my ($self, $testCmd ) = @_; - - if( $testCmd->{'pig'} ){ - return "pig"; - } elsif( $testCmd->{'hadoop'} ){ - return "hadoop"; - } elsif( $testCmd->{'hive'} ){ - return "hive"; - } elsif( $testCmd->{'hcat'} ){ - return "hcat"; - } elsif( $testCmd->{'script'} ){ - return "script"; - } else { - return ""; - } } + sub runTest { - my ($self, $testCmd, $log, $copyResults) = @_; + my ($self, $testCmd, $log) = @_; my $subName = (caller(0))[3]; - # check is root if using 'run_as' - if (defined($testCmd->{'run_as'}) && $testCmd->{'run_as'} ne '') { - my $id = `id -un`; - chomp $id; - if ($id ne 'root') { - die "$subName FATAL You have to run as root to use the 'run_as' directive, you are: $id"; - } - } - # Handle the various methods of running used in # the original TestDrivers - if ( $testCmd->{'pig'} && $self->hasCommandLineVerifications( $testCmd, $log) ) { - return $self->runPigCmdLine( $testCmd, $log, $copyResults ); - } elsif( $testCmd->{'pig'} ){ - return $self->runPig( $testCmd, $log, $copyResults ); - } elsif ( $testCmd->{'hadoop'} && $self->hasCommandLineVerifications( $testCmd, $log) ) { - return $self->runHadoopCmdLine( $testCmd, $log, $copyResults ); - } elsif( $testCmd->{'hadoop'} ){ - return $self->runHadoop( $testCmd, $log, $copyResults ); - } elsif( $testCmd->{'hive'} ){ - return $self->runHive( $testCmd, $log, $copyResults ); - } elsif( $testCmd->{'hcat'} ){ - return $self->runHCat( $testCmd, $log, $copyResults ); - } elsif( $testCmd->{'script'} ){ - return $self->runScript( $testCmd, $log ); + if ( $testCmd->{'hcat'} ) { + return $self->runHCatCmdLine( $testCmd, $log, 1); } else { die "$subName FATAL Did not find a testCmd that I know how to handle"; } } -sub runPigCmdLine +sub runHCatCmdLine { my ($self, $testCmd, $log) = @_; my $subName = (caller(0))[3]; my %result; - - # Set up file locations - my $pigfile = $testCmd->{'localpath'} . 
$testCmd->{'group'} . "_" . $testCmd->{'num'} . ".pig"; my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - + my $hcatCmd = $self->replaceParameters( $testCmd->{'hcat'}, $outfile, $testCmd, $log); my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - my $stdoutfile = "$outdir/stdout"; - my $stderrfile = "$outdir/stderr"; + my ($stdoutfile, $stderrfile); mkpath( [ $outdir ] , 0, 0755) if ( ! -e outdir ); if ( ! -e $outdir ){ @@ -342,874 +162,36 @@ die "$0.$subName FATAL could not mkdir $outdir\n"; } - # Write the pig script to a file. - my $pigcmd = $self->replaceParameters( $testCmd->{'pig'}, $outfile, $testCmd, $log ); + open($stdoutfile, "> $outdir/stdout"); + open($stderrfile, "> $outdir/stderr"); - open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; - print FH $pigcmd . "\n"; - close(FH); + my @hcatfiles = (); + my @outfiles = (); + # Write the hive script to a file. + $hcatfiles[0] = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . + $testCmd->{'num'} . ".0.sql"; + $outfiles[0] = $testCmd->{'thisResultsPath'} . "/" . $testCmd->{'group'} . + "_" . $testCmd->{'num'} . ".0.out"; - # Build the command - my @baseCmd = Util::getBasePigCmd($testCmd); - my @cmd = @baseCmd; - - # Add option -l giving location for secondary logs - ##!!! Should that even be here? - my $locallog = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".log"; - push(@cmd, "-logfile"); - push(@cmd, $locallog); - - # Add pig parameters if they're provided - if (defined($testCmd->{'pig_params'})) { - # Processing :PARAMPATH: in parameters - foreach my $param (@{$testCmd->{'pig_params'}}) { - $param =~ s/:PARAMPATH:/$testCmd->{'paramPath'}/g; - } - push(@cmd, @{$testCmd->{'pig_params'}}); - } - - # Add pig file and redirections - push(@cmd, $pigfile); - my $command= join (" ", @cmd); - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - $command = 'echo "' . $command . '"' . " | su $testCmd->{'run_as'}"; - } - $command= "$command 1> $stdoutfile 2> $stderrfile"; - - - # Run the command - print $log "$0:$subName Going to run command: $command\n"; - print $log "$0:$subName STD OUT IS IN FILE: $stdoutfile\n"; - print $log "$0:$subName STD ERROR IS IN FILE: $stderrfile\n"; - print $log "$0:$subName PIG SCRIPT FILE, $pigfile, CONTAINS:\n<$pigcmd>\n"; - - my @result=`$command`; - $result{'rc'} = $? >> 8; - $result{'output'} = $outfile; - $result{'stdout'} = `cat $stdoutfile`; - $result{'stderr'} = `cat $stderrfile`; - $result{'stderr_file'} = $stderrfile; - - # Here and other run* should do: - # If expected rc defined and = 0 and actual rc <> 0 then - # die "Failed running $pigfile\n"; - - print $log "STD ERROR CONTAINS:\n<$result{'stderr'}>\n"; - - return \%result; -} - -sub runHive -# The code is based on the run runHadoopCmdLine, -# but with the difference that it's output from stdout -# can be used for both comparions against benchmark file and -# verification by pattern matching depending on wether the test -# has a 'sql' or a pattern match directive. -{ - my ($self, $testCmd, $log) = @_; - my $subName = (caller(0))[3]; - my %result; - - # Set up file locations - my $hivefile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".hive"; - # my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - - my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . 
".out"; - my $stdoutfile = "$outdir/stdout"; - my $stderrfile = "$outdir/stderr"; - - my $outfile = $stdoutfile; # For Hive, there is only the stdout, and that is being captured by - # the way the command is run. So, the hive command should _not_ use :OUTFILE: - - mkpath( [ $outdir ] , 0, 0755) if ( ! -e $outdir ); - if ( ! -e $outdir ){ - print $log "$0.$subName FATAL could not mkdir $outdir\n"; - die "$0.$subName FATAL could not mkdir $outdir\n"; - } - - # Write the hive command to a file. - my $hivecmd = $self->replaceParameters( $testCmd->{'hive'}, $outfile, $testCmd, $log ); - - $hivecmd = $self->hiveWorkArounds( $hivecmd, $log ); - - - open(FH, "> $hivefile") or die "Unable to open file $hivefile to write hive command, $ERRNO\n"; - print FH "$hivecmd\n"; + open(FH, "> $hcatfiles[0]") or + die "Unable to open file $hcatfiles[0] to write SQL script, $ERRNO\n"; + print FH $testCmd->{'hcat'} . "\n"; close(FH); - # Build the command - my @cmd = Util::getHiveCmd($testCmd); - - #Add metastore info - push(@cmd, "--hiveconf hive.metastore.local=false --hiveconf hive.metastore.uris=thrift://".$testCmd->{'thriftserver'}); - - - if( defined($testCmd->{'metastore.principal'}) && ($testCmd->{'metastore.principal'} =~ m/\S+/) - && ($testCmd->{'metastore.principal'} ne '${metastore.principal}')){ - push(@cmd, "--hiveconf hive.metastore.sasl.enabled=true --hiveconf hive.metastore.kerberos.principal=$testCmd->{'metastore.principal'}"); - } else { - push(@cmd, "--hiveconf hive.metastore.sasl.enabled=false"); - } - - # Add hive command file - push(@cmd, '-f', $hivefile); - - # Add redirections - # no need to split, as not using IPC run. - my $command= join (" ", @cmd); - # Add hive command line arguments if they're provided - if (defined($testCmd->{'hive_cmdline_args'})) { - $command = $command . $testCmd->{'hive_cmdline_args'} - } - - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - $command = "echo \"$command\" | su $testCmd->{'run_as'}"; - } - $command= "$command 1> $stdoutfile 2> $stderrfile"; - - - # Run the command - print $log "$0:$subName Going to run command: $command\n"; - print $log "$0:$subName STD OUT IS IN FILE: $stdoutfile\n"; - print $log "$0:$subName STD ERROR IS IN FILE: $stderrfile\n"; - print $log "$0:$subName HIVE QUERY FILE, $hivefile, CONTAINS:\n<$hivecmd>\n"; - - my @result=`$command`; + Util::runHCatCmdFromFile($testCmd, $log, $hcatfiles[0], $stdoutfile, $stderrfile); $result{'rc'} = $? >> 8; - $result{'output'} = $outfile; - $result{'stdout'} = `cat $stdoutfile`; # This could be big. Left for now, as compareScript relies on it - $result{'stderr'} = `cat $stderrfile`; - $result{'stderr_file'} = $stderrfile; - - print $log "STD ERROR CONTAINS:\n<$result{'stderr'}>\n"; - - $result{'output'} = $self->postProcessSingleOutputFile($outfile, $outdir, undef, $testCmd, $log); - $result{'originalOutput'} = "$outdir/out_original"; # populated by postProcessSingleOutputFile - + $result{'stdout'} = `cat $outdir/stdout`; + $result{'stderr'} = `cat $outdir/stderr`; + $result{'stderr_file'} = "$outdir/stderr"; return \%result; -} # end sub runHive - - -sub runHCat -# COPY of runHive for now -# When HCat CLI is implemented, then change!!! -{ - my ($self, $testCmd, $log) = @_; - my $subName = (caller(0))[3]; - my %result; - - # Set up file locations - my $hcatfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".hcat"; - # my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . 
".out"; - - my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - my $stdoutfile = "$outdir/stdout"; - my $stderrfile = "$outdir/stderr"; - - my $outfile = $stdoutfile; # For HCat, there is only the stdout, and that is being captured by - # the way the command is run. So, the hcat command should _not_ use :OUTFILE: - - mkpath( [ $outdir ] , 0, 0755) if ( ! -e $outdir ); - if ( ! -e $outdir ){ - print $log "$0.$subName FATAL could not mkdir $outdir\n"; - die "$0.$subName FATAL could not mkdir $outdir\n"; - } - - # Write the hcat command to a file. - my $hcatcmd = $self->replaceParameters( $testCmd->{'hcat'}, $outfile, $testCmd, $log ); - # $hcatcmd = "set hive.metastore.uris=thrift://gwbl2004.blue.ygrid.yahoo.com:9080;\n$hcatcmd"; - - $hcatcmd = $self->hiveWorkArounds( $hcatcmd, $log ); - - open(FH, "> $hcatfile") or die "Unable to open file $hcatfile to write hcat command, $ERRNO\n"; - print FH $hcatcmd . "\n"; - close(FH); - - # Build the command - my @cmd = Util::getHCatCmd($testCmd); - - # Add hcat command line arguments if they're provided - if (defined($testCmd->{'hcat_cmdline_args'})) { - push(@cmd, @{$testCmd->{'hcat_cmdline_args'}}); - } - - # Add hcat command file - if (defined($testCmd->{'hcat_cmdline_use_-e_switch'})) { - if (defined($testCmd->{'run_as'})) { - push(@cmd, '-e', '\"' . $hcatcmd . '\"'); - } else { - push(@cmd, '-e', '"' . $hcatcmd . '"'); - } - } else { - push(@cmd, '-f', $hcatfile); - } - - # Add redirections - # no need to split, as not using IPC run. - my $command= join (" ", @cmd); - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - $command = "echo \"$command\" | su $testCmd->{'run_as'}"; - } - $command= "$command 1> $stdoutfile 2> $stderrfile"; - - # Run the command - print $log "$0:$subName Going to run command: $command\n"; - print $log "$0:$subName STD OUT IS IN FILE: $stdoutfile\n"; - print $log "$0:$subName STD ERROR IS IN FILE: $stderrfile\n"; - print $log "$0:$subName HCAT QUERY FILE, $hcatfile, CONTAINS:\n<$hcatcmd>\n"; - - my @result=`$command`; - $result{'rc'} = $? >> 8; - $result{'output'} = $outfile; - $result{'stdout'} = `cat $stdoutfile`; # This could be big. Left for now, as compareScript relies on it - $result{'stderr'} = `cat $stderrfile`; - $result{'stderr_file'} = $stderrfile; - - print $log "STD ERROR CONTAINS:\n<$result{'stderr'}>\n"; - - $result{'output'} = $self->postProcessSingleOutputFile($outfile, $outdir, undef, $testCmd, $log); - $result{'originalOutput'} = "$outdir/out_original"; # populated by postProcessSingleOutputFile - - return \%result; -} # end sub runHCat - - -sub runHadoopCmdLine -# Modified from runPigCmdLine -# !!! Works, but need to add other arguments, like queue...??? -{ - my ($self, $testCmd, $log) = @_; - my $subName = (caller(0))[3]; - my %result; - - # Set up file locations - my $hadoopfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".hadoop"; - my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - - my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - my $stdoutfile = "$outdir/stdout"; - my $stderrfile = "$outdir/stderr"; - - mkpath( [ $outdir ] , 0, 0755) if ( ! -e outdir ); - if ( ! -e $outdir ){ - print $log "$0.$subName FATAL could not mkdir $outdir\n"; - die "$0.$subName FATAL could not mkdir $outdir\n"; - } - - # Write the hadoop command to a file. 
- my $hadoopcmd = $self->replaceParameters( $testCmd->{'hadoop'}, $outfile, $testCmd, $log ); - - # adjust for the leading and trailing new line often seen in the conf file's command directives - $hadoopcmd =~ s/^\s*(.*?)\s*$/\1/s; - - open(FH, "> $hadoopfile") or die "Unable to open file $hadoopfile to write hadoop command, $ERRNO\n"; - print FH $hadoopcmd . "\n"; - close(FH); - - # Build the command - my @baseCmd = Util::getHadoopCmd($testCmd); - my @cmd = @baseCmd; - - # Add command line arguments if they're provided - if (defined($testCmd->{'hadoop_cmdline_args'})) { - push(@cmd, @{$testCmd->{'hadoop_cmdline_args'}}); - } - - # Add hadoop command and redirections - push(@cmd, $hadoopcmd); # no need to split, as not using IPC run. - my $command= join (" ", @cmd); - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - $command = "echo \"$command\" | su $testCmd->{'run_as'}"; - } - $command= "$command 1> $stdoutfile 2> $stderrfile"; - - #Set HADOOP_CLASSPATH environment variable if provided - if (defined($testCmd->{'hadoop_classpath'})) { - my $hadoop_classpath = $self->replaceParameters( $testCmd->{'hadoop_classpath'}, $outfile, $testCmd, $log ); - $ENV{'HADOOP_CLASSPATH'} = $ENV{'HCAT_EXTRA_JARS'}; - } - my $hadoop_opts = "-Dhive.metastore.uris=thrift://".$testCmd->{'thriftserver'}." -Dhcat.metastore.uri=thrift://".$testCmd->{'thriftserver'}; - if( defined($testCmd->{'metastore.principal'}) && ($testCmd->{'metastore.principal'} =~ m/\S+/) - && ($testCmd->{'metastore.principal'} ne '${metastore.principal}')){ - $hadoop_opts = join '',$hadoop_opts," -Dhive.metastore.sasl.enabled=true -Dhcat.metastore.principal=", - $testCmd->{'metastore.principal'}," -Dhive.metastore.kerberos.principal=",$testCmd->{'metastore.principal'}; - } else { - $hadoop_opts = join '',$hadoop_opts," -Dhive.metastore.sasl.enabled=false"; - } - $ENV{'HADOOP_OPTS'} = $hadoop_opts; - # Run the command - print $log "$0:$subName Going to run command: $command\n"; - print $log "$0:$subName STD OUT IS IN FILE: $stdoutfile\n"; - print $log "$0:$subName STD ERROR IS IN FILE: $stderrfile\n"; - print $log "$0:$subName HADOOP COMMAND FILE, $hadoopfile, CONTAINS:\n<$hadoopcmd>\n"; - - my @result=`$command`; - $result{'rc'} = $? >> 8; - # $result{'output'} = $outfile; - $result{'output'} = $stdoutfile; - $result{'stdout'} = `cat $stdoutfile`; - $result{'stderr'} = `cat $stderrfile`; - $result{'stderr_file'} = $stderrfile; - - print $log "STD ERROR CONTAINS:\n<$result{'stderr'}>\n"; - - #!!!!!!!!!!!!!! IS this be needed here???? - # my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - # $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); - # $result{'originalOutput'} = "$outdir/out_original"; # populated by postProcessSingleOutputFile - - return \%result; -} # end sub runHadoopCmdLine - - -sub runScript -{ - my ($self, $testCmd, $log) = @_; - my $subName = (caller(0))[3]; - my %result; - - # Set up file locations - my $script = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".sh"; - my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - - my $outfile = "$outdir/script.out"; - my $stdoutfile = "$outdir/script.out"; - my $stderrfile = "$outdir/script.err"; - - mkpath( [ $outdir ] , 0, 0755) if ( ! -e outdir ); - if ( ! 
-e $outdir ){ - print $log "$0.$subName FATAL could not mkdir $outdir\n"; - die "$0.$subName FATAL could not mkdir $outdir\n"; - } - - # Write the script to a file - my $cmd = $self->replaceParameters( $testCmd->{'script'}, $outfile, $testCmd, $log ); - - open(FH, ">$script") or die "Unable to open file $script to write script, $ERRNO\n"; - print FH $cmd . "\n"; - close(FH); - - my @result=`chmod +x $script`; - - # Build the command - my $command; - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - $command = "cat $script | su $testCmd->{'run_as'}"; - } else { - $command= "$script"; - } - $command= "$command 1> $stdoutfile 2> $stderrfile"; - - # Run the script - print $log "$0:$subName Going to run command: $command\n"; - print $log "$0:$subName STD OUT IS IN FILE ($stdoutfile)\n"; - print $log "$0:$subName STD ERROR IS IN FILE ($stderrfile)\n"; - print $log "$0:$subName SCRIPT IS IN FILE ($script)\n"; - print $log "$0:$subName SCRIPT CONTAINS:\n<$cmd>\n"; - - @result=`$command`; - $result{'rc'} = $? >> 8; - $result{'output'} = $outfile; - $result{'stdout'} = `cat $stdoutfile`; - $result{'stderr'} = `cat $stderrfile`; - $result{'stderr_file'} = $stderrfile; - - print $log "STD ERROR CONTAINS:\n<$result{'stderr'}>\n"; - - return \%result; } - -sub runHadoop -# Being modified from runPig -# !!! Works, but need to add other arguments, like queue...??? -{ - my ($self, $testCmd, $log, $copyResults) = @_; - my $subName = (caller(0))[3]; - - my %result; - - # Write the hadoop command to a file. - my $hadoopfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".hadoop"; - my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - - my $hadoopcmd = $self->replaceParameters( $testCmd->{'hadoop'}, $outfile, $testCmd, $log ); - - # adjust for the leading and trailing new line often seen in the conf file's command directives - $hadoopcmd =~ s/^\s*(.*?)\s*$/\1/s; - - open(FH, "> $hadoopfile") or die "Unable to open file $hadoopfile to write hadoop command file, $ERRNO\n"; - print FH $hadoopcmd . "\n"; - close(FH); - - - # Build the command - my @cmd = Util::getHadoopCmd($testCmd); - - # Add command line arguments if they're provided - if (defined($testCmd->{'hadoop_cmdline_args'})) { - push(@cmd, @{$testCmd->{'hadoop_cmdline_args'}}); - } - - # Add the test command elements - push(@cmd, split(/ +/,$hadoopcmd)); - - # Set HADOOP_CLASSPATH environment variable if provided - if (defined($testCmd->{'hadoop_classpath'})) { - my $hadoop_classpath = $self->replaceParameters( $testCmd->{'hadoop_classpath'}, $outfile, $testCmd, $log ); - $ENV{'HADOOP_CLASSPATH'} = $ENV{'HCAT_EXTRA_JARS'}; - } - - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - my $cmd = '"' . join (" ", @cmd) . '"'; - @cmd = ("echo", $cmd, "|", "su", $testCmd->{'run_as'}); - } - - my $script = $hadoopfile . ".sh"; - open(FH, ">$script") or die "Unable to open file $script to write script, $ERRNO\n"; - print FH join (" ", @cmd) . "\n"; - close(FH); - my @result=`chmod +x $script`; - - # Run the command - print $log "$0::$className::$subName INFO: Going to run hadoop command in shell script: $script\n"; - print $log "$0::$className::$subName INFO: Going to run hadoop command: " . join(" ", @cmd) . "\n"; - - my @runpig = ("$script"); - IPC::Run::run(\@runpig, \undef, $log, $log) or - die "Failed running $script\n"; - $result{'rc'} = $? 
>> 8; - - # Get results from the command locally - my @basePigCmd = Util::getBasePigCmd($testCmd); - - my $localoutfile; - my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - my @SQLQuery = @{$testCmd->{'queries'}}; # here only used to determine if single-guery of multi-query - - - # mapreduce - if($self->{'exectype'} eq "mapred") - { - # single query - if ($#SQLQuery == 0) { - if ($copyResults) { - $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@basePigCmd, $testCmd, $log); - $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile - } else { - $result{'output'} = "NO_COPY"; - } - } - } - # local mode - else - { - # single query - if ($#SQLQuery == 0) { - $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".dir"; - mkdir $localdir; - $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@basePigCmd, $testCmd, $log); - $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile - } - } - - # Compare doesn't get the testCmd hash, so I need to stuff the necessary - # info about sorting into the result. - if (defined $testCmd->{'sortArgs'} && $testCmd->{'sortArgs'}) { - $result{'sortArgs'} = $testCmd->{'sortArgs'}; - } - - return \%result; -} # end sub runHadoop - - - -sub runPig -{ - my ($self, $testCmd, $log, $copyResults) = @_; - my $subName = (caller(0))[3]; - - my %result; - - # Write the pig script to a file. - my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".pig"; - my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - - my $pigcmd = $self->replaceParameters( $testCmd->{'pig'}, $outfile, $testCmd, $log ); - - open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; - print FH $pigcmd . "\n"; - close(FH); - - - # Build the command - my @baseCmd = Util::getBasePigCmd($testCmd); - my @cmd = @baseCmd; - - # Add option -l giving location for secondary logs - my $locallog = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".log"; - push(@cmd, "-logfile"); - push(@cmd, $locallog); - - my $pig_opts = "-Dhive.metastore.uris=thrift://".$testCmd->{'thriftserver'}." -Dhcat.metastore.uri=thrift://".$testCmd->{'thriftserver'}; - if( defined($testCmd->{'metastore.principal'}) && ($testCmd->{'metastore.principal'} =~ m/\S+/) - && ($testCmd->{'metastore.principal'} ne '${metastore.principal}')){ - $pig_opts = join '',$pig_opts," -Dhive.metastore.sasl.enabled=true -Dhcat.metastore.principal=", - $testCmd->{'metastore.principal'}," -Dhive.metastore.kerberos.principal=",$testCmd->{'metastore.principal'}; - } else { - $pig_opts = join '',$pig_opts," -Dhive.metastore.sasl.enabled=false"; - } - $ENV{'PIG_OPTS'} = $pig_opts; - - # Add pig parameters if they're provided - if (defined($testCmd->{'pig_params'})) { - # Processing :PARAMPATH: in parameters - foreach my $param (@{$testCmd->{'pig_params'}}) { - $param =~ s/:PARAMPATH:/$testCmd->{'paramPath'}/g; - } - push(@cmd, @{$testCmd->{'pig_params'}}); - } - - push(@cmd, $pigfile); - - # Add su user if provided - if (defined($testCmd->{'run_as'})) { - my $cmd = '"' . join (" ", @cmd) . '"'; - @cmd = ("echo", $cmd, "|", "su", $testCmd->{'run_as'}); - } - - my $script = $pigfile . ".sh"; - open(FH, ">$script") or die "Unable to open file $script to write script, $ERRNO\n"; - print FH join (" ", @cmd) . 
"\n"; - close(FH); - my @result=`chmod +x $script`; - - # Run the command - print $log "$0::$className::$subName INFO: Going to run pig command in shell script: $script\n"; - print $log "$0::$className::$subName INFO: Going to run pig command: " . join(" ", @cmd) . "\n"; - - my @runpig = ("$script"); - IPC::Run::run(\@runpig, \undef, $log, $log) or - die "Failed running $script\n"; - $result{'rc'} = $? >> 8; - - # Get results from the command locally - my $localoutfile; - my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - my @SQLQuery = @{$testCmd->{'queries'}}; # here only used to determine if single-guery of multi-query - - # mapreduce - if($self->{'exectype'} eq "mapred") - { - # single query - if ($#SQLQuery == 0) { - if ($copyResults) { - $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); - $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile - } else { - $result{'output'} = "NO_COPY"; - } - } - # multi query - else { - my @outfiles = (); - for (my $id = 1; $id <= ($#SQLQuery + 1); $id++) { - $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id"; - $localoutfile = $outfile . ".$id"; - - # Copy result file out of hadoop - my $testOut; - if ($copyResults) { - $testOut = $self->postProcessSingleOutputFile($localoutfile, $localdir, \@baseCmd, $testCmd, $log); - } else { - $testOut = "NO_COPY"; - } - push(@outfiles, $testOut); - } - ##!!! originalOutputs not set! Needed? - $result{'outputs'} = \@outfiles; - } - } - # local mode - else - { - # single query - if ($#SQLQuery == 0) { - $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".dir"; - mkdir $localdir; - $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); - $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile - } - # multi query - else { - my @outfiles = (); - for (my $id = 1; $id <= ($#SQLQuery + 1); $id++) { - $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - mkdir $localdir; - $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id"; - mkdir $localdir; - $localoutfile = $outfile . ".$id"; - - my $testRes = $self->postProcessSingleOutputFile($localoutfile, $localdir, \@baseCmd, $testCmd, $log); - push(@outfiles, $testRes); - } - ##!!! originalOutputs not set! - $result{'outputs'} = \@outfiles; - } - } - - # Compare doesn't get the testCmd hash, so I need to stuff the necessary - # info about sorting into the result. - if (defined $testCmd->{'sortArgs'} && $testCmd->{'sortArgs'}) { - $result{'sortArgs'} = $testCmd->{'sortArgs'}; - } - - return \%result; -} - - -sub postProcessSingleOutputFile -{ - my ($self, $outfile, $localdir, $baseCmd, $testCmd, $log) = @_; - my $subName = (caller(0))[3]; - - my $from_hdfs; - if ( ($testCmd->{'pig'} || $testCmd->{'hadoop'}) && ($self->{'exectype'} eq "mapred")) { - $from_hdfs = 1; - } else { - $from_hdfs = 0; - } - - # Copy to local if results on HDFS - if ( $from_hdfs ) { - my @baseCmd = @{$baseCmd}; - my @copyCmd = @baseCmd; - push(@copyCmd, ('-e', 'copyToLocal', $outfile, $localdir)); - print $log "$0::$className::$subName INFO: Going to run pig command: " . join(" ", @copyCmd) . "\n"; - - - my $id = `id -un`; - chomp $id; - if ($id eq 'root') { - # my @suCmd = ('su', 'hadoopqa', '-c', "'" . 
join(' ', @cmd) . "'"); - # print $log join(" ", @suCmd) . "\n"; - # IPC::Run::run(\@suCmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; - # above failed, doing below for now... - - my $command= join (" ", @copyCmd); - $command = "echo \"$command\" | su hadoopqa 2>&1"; - print $log "$command\n"; - my @result=`$command`; - my $rc = $? >> 8; - print $log "Output from copy from HDFS: " . join (" ", @result) . "\n"; - die "Cannot copy results from HDFS $outfile to $localdir\n" if $rc != 0; - - } else { - print $log join(" ", @copyCmd) . "\n"; - IPC::Run::run(\@copyCmd, \undef, $log, $log) or die "Cannot copy results from HDFS $outfile to $localdir\n"; - } - - } - - - # Sort the result if necessary. Keep the original output in one large file. - # Use system not IPC run so that the '*' gets interpolated by the shell. - - # Build command to: - # 1. Combine part files - my $fppCmd = - ($from_hdfs ) ? "cat $localdir/map* $localdir/part* 2>/dev/null" : - (-d $outfile ) ? "cat $outfile/part* 2>/dev/null" : - "cat $outfile"; - - # 2. Standardize float precision - if (defined $testCmd->{'floatpostprocess'} && defined $testCmd->{'delimiter'}) { - $fppCmd .= " | $toolpath/floatpostprocessor '" . $testCmd->{'delimiter'} . "'"; - } - - $fppCmd .= " > $localdir/out_original"; - - # run command - print $log "$fppCmd\n"; - system($fppCmd); - - - # Sort the results for the benchmark compare. - if ( $testCmd->{'sortResults'} eq '1' ) { - my @sortCmd = ('sort', "$localdir/out_original"); - print $log join(" ", @sortCmd) . "\n"; - IPC::Run::run(\@sortCmd, '>', "$localdir/out_sorted"); - return "$localdir/out_sorted"; - } else { - return "$localdir/out_original"; - } -} - -sub generateBenchmark -{ - my ($self, $testCmd, $log) = @_; - - my %result; - - my @SQLQuery = @{$testCmd->{'queries'}}; - - if ($#SQLQuery == 0) { - my $outfile = $self->generateSingleSQLBenchmark($testCmd, $SQLQuery[0], undef, $log); - $result{'output'} = $outfile; - } else { - my @outfiles = (); - for (my $id = 0; $id < ($#SQLQuery + 1); $id++) { - my $sql = $SQLQuery[$id]; - my $outfile = $self->generateSingleSQLBenchmark($testCmd, $sql, ($id+1), $log); - push(@outfiles, $outfile); - } - $result{'outputs'} = \@outfiles; - } - - return \%result; -} - -sub generateSingleSQLBenchmark -{ - my ($self, $testCmd, $sql, $id, $log) = @_; - my $subName = (caller(0))[3]; - - my $command_directive; - if( $testCmd->{'pig'} ){ - $command_directive = 'pig'; - } elsif( $testCmd->{'hadoop'} ){ - $command_directive = 'hadoop'; - } elsif( $testCmd->{'hive'} ){ - $command_directive = 'hive'; - } elsif( $testCmd->{'hcat'} ){ - $command_directive = 'hcat'; - } elsif( $testCmd->{'script'} ){ - $command_directive = 'script'; - } else { - die "$subName FATAL Did not find a testCmd that I know how to handle"; - } - - my $qmd5 = substr(md5_hex($testCmd->{$command_directive}), 0, 5); - my $outfile = $testCmd->{'benchmarkPath'} . "/" . $testCmd->{'group'} . "_" . $testCmd->{'num'}; - $outfile .= defined($id) ? ".$id" . ".expected." . $qmd5 : ".expected." . 
$qmd5; - - - print $log "Getting benchmark file: $outfile\n"; - - if (-e $outfile) { - return $outfile; - } - - my @cmd = ('psql', '-U', $testCmd->{'dbuser'}, '-d', $testCmd->{'dbdb'}, - '-c', $sql, '-t', '-A', '--pset', "fieldsep=' '", '-o', $outfile); - - - # To facilitate generating the benchmarks manually on a different machine, if postgres db not configured - # cmdForFile is as cmd above just with quotes around the $sql - # Added extension '.sh' becuase the script now also does sort and float postprocessing if applicable - my @cmdForFile = ('psql', '-U', $testCmd->{'dbuser'}, '-d', $testCmd->{'dbdb'}, - '-c', '"'.$sql.'"', '-t', '-A', '--pset', "fieldsep=' '", '-o', $outfile); - my $psqlfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".psql.sh"; - open(FH, "> $psqlfile") or die "Unable to open file $psqlfile to write psql script, $ERRNO\n"; - print FH join(" ", @cmdForFile) . "\n"; - - - # Prepare to sort and postprocess the result if necessary - my $shellCmd = "cat $outfile"; - if (defined $testCmd->{'floatpostprocess'} && defined $testCmd->{'delimiter'}) { - $shellCmd .= " | $toolpath/floatpostprocessor '" . $testCmd->{'delimiter'} . "' "; - } - - if ( $testCmd->{'sortBenchmark'} eq '1' ) { - $shellCmd .= " | sort"; - if (defined $testCmd->{'sortBenchmarkArgs'}) { # but the pig test conf files don't use that anyway... - $shellCmd .= " " . join(" ", @{$testCmd->{'sortBenchmarkArgs'}}); - } - } - - my $tmpfile = $outfile . ".tmp"; - $shellCmd .= " > $tmpfile"; - - # Complete the writing to file - print FH "$shellCmd\n"; - print FH "mv $tmpfile $outfile\n"; - close(FH); - print $log "SQL command file: $psqlfile\n"; - - # Run... - print $log "Running SQL command [" . join(" ", @cmd) . "\n"; - IPC::Run::run(\@cmd, \undef, $log, $log) or do { - print $log "Sql command <" . $sql . - " failed for >>$testCmd->{group}_$testCmd->{num}<<\n"; - unlink $outfile if ( -e $outfile ); - - die "Sql command failed for >>$testCmd->{group}_$testCmd->{num}<<\n"; - }; - # Use system not IPC run so that any '*' gets interpolated by the shell. - print $log "$shellCmd\n"; - system($shellCmd); - unlink $outfile; - IPC::Run::run ['mv', $tmpfile, $outfile]; - - return $outfile; -} - -sub hasCommandLineVerifications -{ - my ($self, $testCmd, $log) = @_; - - foreach my $key ('rc', 'expected_out', 'expected_out_regex', 'expected_err', 'expected_err_regex', - 'not_expected_out', 'not_expected_out_regex', 'not_expected_err', 'not_expected_err_regex' ) { - if (defined $testCmd->{$key}) { - return 1; - } - } - return 0; -} - - -sub compare -{ - my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_; - my $subName = (caller(0))[3]; - # Returns 0 (false) for failed test, non-zero (true) for passed test - - # For now, if the test has - # - testCmd pig, and 'sql' for benchmark, then use compareToBenchmark - # - any verification directives formerly used by CmdLine or Script drivers (rc, regex on out and err...) - # then use compareScript even if testCmd is "pig" - # - testCmd script, then use compareScript - # - testCmd pig, and none of the above, then use compareToBenchmark - # - # Later, should add ability to have same tests both verify with the 'script' directives, - # and do a benchmark compare, if it was a pig cmd. E.g. 'rc' could still be checked when - # doing the benchmark compare. 
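# Illustrative sketch only, not part of the patch: a minimal, hypothetical test
# stanza showing the command-line verification keys that compareScript below
# inspects; the query, table name, and expected values are invented.
use strict;
use warnings;

my $test = {
    'num'                    => 1,
    'hcat'                   => q(show tables;),
    'rc'                     => 0,               # expected exit code
    'expected_out_regex'     => 'studenttab10k', # must appear in stdout
    'not_expected_err_regex' => 'FAILED',        # must not appear in stderr
};

# If any of these keys is present, the harness verifies the run with
# compareScript instead of a benchmark diff (cf. hasCommandLineVerifications).
my @verify_keys = qw(rc expected_out expected_out_regex expected_err
                     expected_err_regex not_expected_out not_expected_out_regex
                     not_expected_err not_expected_err_regex);
my $cmdline_checked = grep { defined $test->{$_} } @verify_keys;
print "verify via compareScript: ", ($cmdline_checked ? "yes" : "no"), "\n";  # yes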
- - if( defined $testCmd->{'sql'} ){ - return $self->compareToBenchmark ( $testResult, $benchmarkResult, $log, $testCmd); - } elsif( $self->hasCommandLineVerifications( $testCmd, $log) ){ - return $self->compareScript ( $testResult, $log, $testCmd); - } elsif( $testCmd->{'pig'} ){ - # maybe using a custom benchmark file, and has no 'sql' directive - return $self->compareToBenchmark ( $testResult, $benchmarkResult, $log, $testCmd); - } else { - print $log "$0.$subName WARNING Did not find a comparison method. Use 'noverify' if this is intented.\n"; - return 0; - } -} - - sub compareScript { my ($self, $testResult, $log, $testCmd) = @_; my $subName = (caller(0))[3]; + # IMPORTANT NOTES: # # If you are using a regex to compare stdout or stderr @@ -1224,6 +206,7 @@ my $result = 1; # until proven wrong... + # Return Code if (defined $testCmd->{'rc'}) { print $log "$0::$subName INFO Checking return code " . @@ -1234,92 +217,76 @@ } } - # ???? Will that ever be needed? - my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; - # Standard Out if (defined $testCmd->{'expected_out'}) { - my $pattern = $self->replaceParameters( $testCmd->{'expected_out'}, $outfile, $testCmd, $log ); - print $log "$0::$subName INFO Checking test stdout " . - "as exact match against expected <$pattern>\n"; - if ($testResult->{'stdout'} ne $pattern) { - print $log "$0::$subName INFO Check failed: exact match of <$pattern> expected in stdout: <$testResult->{'stdout'}>\n"; + print $log "$0::$subName INFO Checking test stdout' " . + "as exact match against expected <$testCmd->{'expected_out'}>\n"; + if ($testResult->{'stdout'} ne $testCmd->{'expected_out'}) { + print $log "$0::$subName INFO Check failed: exact match of <$testCmd->{'expected_out'}> expected in stdout: $testResult->{'stdout'}\n"; $result = 0; } } if (defined $testCmd->{'not_expected_out'}) { - my $pattern = $self->replaceParameters( $testCmd->{'not_expected_out'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stdout " . - "as NOT exact match against expected <$pattern>\n"; - if ($testResult->{'stdout'} eq $pattern) { - print $log "$0::$subName INFO Check failed: NON-match of <$pattern> expected to stdout: <$testResult->{'stdout'}>\n"; + "as NOT exact match against expected <$testCmd->{'expected_out'}>\n"; + if ($testResult->{'stdout'} eq $testCmd->{'not_expected_out'}) { + print $log "$0::$subName INFO Check failed: NON-match of <$testCmd->{'expected_out'}> expected to stdout: $testResult->{'stdout'}\n"; $result = 0; } } if (defined $testCmd->{'expected_out_regex'}) { - my $pattern = $self->replaceParameters( $testCmd->{'expected_out_regex'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stdout " . - "for regular expression <$pattern>\n"; - # if ($testResult->{'stdout'} !~ $pattern) { - if ($testResult->{'stdout'} !~ /$pattern/m) { - print $log "$0::$subName INFO Check failed: regex match of <$pattern> expected in stdout: <$testResult->{'stdout'}>\n"; + "for regular expression <$testCmd->{'expected_out_regex'}>\n"; + if ($testResult->{'stdout'} !~ $testCmd->{'expected_out_regex'}) { + print $log "$0::$subName INFO Check failed: regex match of <$testCmd->{'expected_out_regex'}> expected in stdout: $testResult->{'stdout'}\n"; $result = 0; } } if (defined $testCmd->{'not_expected_out_regex'}) { - my $pattern = $self->replaceParameters( $testCmd->{'not_expected_out_regex'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stdout " . 
- "for NON-match of regular expression <$pattern>\n"; - # if ($testResult->{'stdout'} =~ $pattern) { - if ($testResult->{'stdout'} =~ /$pattern/m) { - print $log "$0::$subName INFO Check failed: regex NON-match of <$pattern> expected in stdout: <$testResult->{'output'}>\n"; - # prints HDFS location, should give local + "for NON-match of regular expression <$testCmd->{'not_expected_out_regex'}>\n"; + if ($testResult->{'stdout'} =~ $testCmd->{'not_expected_out_regex'}) { + print $log "$0::$subName INFO Check failed: regex NON-match of <$testCmd->{'not_expected_out_regex'}> expected in stdout: $testResult->{'stdout'}\n"; $result = 0; } } # Standard Error if (defined $testCmd->{'expected_err'}) { - my $pattern = $self->replaceParameters( $testCmd->{'expected_err'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stderr " . - "as exact match against expected <$pattern>\n"; - if ($testResult->{'stderr'} ne $pattern) { - print $log "$0::$subName INFO Check failed: exact match of <$pattern> expected in stderr: $testResult->{'stderr_file'}\n"; + "as exact match against expected <$testCmd->{'expected_err'}>\n"; + if ($testResult->{'stderr'} ne $testCmd->{'expected_err'}) { + print $log "$0::$subName INFO Check failed: exact match of <$testCmd->{'expected_err'}> expected in stderr: $testResult->{'stderr_file'}\n"; $result = 0; } } if (defined $testCmd->{'not_expected_err'}) { - my $pattern = $self->replaceParameters( $testCmd->{'not_expected_err'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stderr " . - "as NOT an exact match against expected <$pattern>\n"; - if ($testResult->{'stderr'} eq $pattern) { - print $log "$0::$subName INFO Check failed: NON-match of <$pattern> expected to stderr: $testResult->{'stderr_file'}\n"; + "as NOT an exact match against expected <$testCmd->{'expected_err'}>\n"; + if ($testResult->{'stderr'} eq $testCmd->{'not_expected_err'}) { + print $log "$0::$subName INFO Check failed: NON-match of <$testCmd->{'expected_err'}> expected to stderr: $testResult->{'stderr_file'}\n"; $result = 0; } } if (defined $testCmd->{'expected_err_regex'}) { - my $pattern = $self->replaceParameters( $testCmd->{'expected_err_regex'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stderr " . - "for regular expression <$pattern>\n"; - # if ($testResult->{'stderr'} !~ $pattern) { - if ($testResult->{'stderr'} !~ /$pattern/m) { - print $log "$0::$subName INFO Check failed: regex match of <$pattern> expected in stderr: $testResult->{'stderr_file'}\n"; + "for regular expression <$testCmd->{'expected_err_regex'}>\n"; + if ($testResult->{'stderr'} !~ m/$testCmd->{'expected_err_regex'}/ms) { + print $log "$0::$subName INFO Check failed: regex match of <$testCmd->{'expected_err_regex'}> expected in stderr: $testResult->{'stderr_file'}\n"; $result = 0; } } if (defined $testCmd->{'not_expected_err_regex'}) { - my $pattern = $self->replaceParameters( $testCmd->{'not_expected_err_regex'}, $outfile, $testCmd, $log ); print $log "$0::$subName INFO Checking test stderr " . 
- "for NON-match of regular expression <$pattern>\n"; - # if ($testResult->{'stderr'} =~ $pattern) { - if ($testResult->{'stderr'} =~ /$pattern/m) { - print $log "$0::$subName INFO Check failed: regex NON-match of <$pattern> expected in stderr: $testResult->{'stderr_file'}\n"; + "for NON-match of regular expression <$testCmd->{'not_expected_err_regex'}>\n"; + if ($testResult->{'stderr'} =~ $testCmd->{'not_expected_err_regex'}) { + print $log "$0::$subName INFO Check failed: regex NON-match of <$testCmd->{'not_expected_err_regex'}> expected in stderr: $testResult->{'stderr_file'}\n"; $result = 0; } } @@ -1327,442 +294,16 @@ return $result; } - -sub compareToBenchmark +sub compare { my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_; - my $subName = (caller(0))[3]; - my $result; - my @SQLQuery = @{$testCmd->{'queries'}}; - - if ($#SQLQuery == 0) { - $result = $self->compareSingleOutput($testResult, $testResult->{'output'}, - $benchmarkResult->{'output'}, $log); - } else { - my $res = 0; - for (my $id = 0; $id < ($#SQLQuery + 1); $id++) { - my $testOutput = ($testResult->{'outputs'})->[$id]; - my $benchmarkOutput = ($benchmarkResult->{'outputs'})->[$id]; - $res += $self->compareSingleOutput($testResult, $testOutput, - $benchmarkOutput, $log); - $result = ($res == ($#SQLQuery + 1)) ? 1 : 0; - } - } - - return $result; + # Return Code + return $self->compareScript ( $testResult, $log, $testCmd); } - -sub compareSingleOutput +sub generateBenchmark { - my ($self, $testResult, $testOutput, $benchmarkOutput, $log) = @_; - - # cksum the the two files to see if they are the same - my ($testChksm, $benchmarkChksm); - IPC::Run::run((['cat', $testOutput], '|', ['cksum']), \$testChksm, - $log) or die "$0: error: cannot run cksum on test results\n"; - IPC::Run::run((['cat', $benchmarkOutput], '|', ['cksum']), - \$benchmarkChksm, $log) or die "$0: error: cannot run cksum on benchmark\n"; - - chomp $testChksm; - chomp $benchmarkChksm; - print $log "test cksum: $testChksm\nbenchmark cksum: $benchmarkChksm\n"; - - my $result; - if ($testChksm ne $benchmarkChksm) { - print $log "Test output checksum does not match benchmark checksum\n"; - print $log "Test checksum = <$testChksm>\n"; - print $log "Expected checksum = <$benchmarkChksm>\n"; - print $log "RESULTS DIFFER: vimdiff $testOutput $benchmarkOutput\n"; - } else { - print $log "Test output matches benchmark file: $benchmarkOutput\n"; - $result = 1; - } - - # Now, check if the sort order is specified - if (defined($testResult->{'sortArgs'})) { - Util::setLocale(); - my @sortChk = ('sort', '-cs'); - push(@sortChk, @{$testResult->{'sortArgs'}}); - push(@sortChk, $testResult->{'originalOutput'}); - print $log "Going to run sort check command: " . join(" ", @sortChk) . "\n"; - IPC::Run::run(\@sortChk, \undef, $log, $log); - my $sortrc = $?; - if ($sortrc) { - print $log "Sort check failed\n"; - $result = 0; - } - } - - return $result; } -############################################################################### -# This method has been copied over from TestDriver to make changes to -# support skipping tests which do not match current execution mode -# -# -# Run all the tests in the configuration file. -# @param testsToRun - reference to array of test groups and ids to run -# @param testsToMatch - reference to array of test groups and ids to match. -# If a test group_num matches any of these regular expressions it will be run. 
-# @param cfg - reference to contents of cfg file -# @param log - reference to a stream pointer for the logs -# @param dbh - reference database connection -# @param testStatuses- reference to hash of test statuses -# @param confFile - config file name -# @param startat - test to start at. -# @returns nothing -# failed. -# -sub run -{ - my ($self, $testsToRun, $testsToMatch, $cfg, $log, $dbh, $testStatuses, - $confFile, $startat, $logname ) = @_; - my $subName = (caller(0))[3]; - - my $msg=""; - my $testDuration=0; - my $totalDuration=0; - my $groupDuration=0; - - my $sawstart = !(defined $startat); - # Rather than make each driver handle our multi-level cfg, we'll flatten - # the hashes into one for it. - my %globalHash; - - my $runAll = ((scalar(@$testsToRun) == 0) && (scalar(@$testsToMatch) == 0)); - - # Read the global keys - foreach (keys(%$cfg)) { - next if $_ eq 'groups'; - $globalHash{$_} = $cfg->{$_}; - } - - # Do the global setup - $self->globalSetup(\%globalHash, $log); - - # Used in generating Junit XML test report - my $generateJunitReport=1; - my $report=0; - my $properties; - my $xmlDir; - - if ($generateJunitReport) { - $properties= new Properties( 0, $globalHash{'propertiesFile'} ); - - # For the xml directory, use the default directory from the configuration file - # unless the directory was specified in the command line - $xmlDir= $globalHash{'localxmlpathbase'} ."/run". $globalHash{'UID'}; - if ( $globalHash{'reportdir'} ) { - $xmlDir = $globalHash{'reportdir'}; - } - } - - my %groupExecuted; - - # $cfg->{'suite'} needs to be set in bin/miners_test_harness to the name of the test conf file... - if ($cfg->{'suite'}) { - print $log "Beginning suite $cfg->{'suite'} at " . time . ($cfg->{'comment'} ? ", comment: $cfg->{'comment'}" : "") . "\n"; - } else { - print $log "Beginning suite at " . time . ($cfg->{'comment'} ? ", comment: $cfg->{'comment'}" : "") . "\n"; - } - - foreach my $group (@{$cfg->{'groups'}}) { - my %groupHash = %globalHash; - $groupHash{'group'} = $group->{'name'}; - - # Read the group keys - $groupHash{'comment'} = undef; # no inheritance of comments - foreach (keys(%$group)) { - next if $_ eq 'tests'; - $groupHash{$_} = $group->{$_}; - } - - print $log "Beginning group $groupHash{'group'} at " . time . ($groupHash{'comment'} ? ", comment: $groupHash{'comment'}" : "") . "\n"; - - # Run each test - foreach my $test (@{$group->{'tests'}}) { - # Check if we're supposed to run this one or not. - if (!$runAll) { - # check if we are supposed to run this test or not. - my $foundIt = 0; - foreach (@$testsToRun) { - if (/^$groupHash{'group'}(_[0-9]+)?$/) { - if (not defined $1) { - # In this case it's just the group name, so we'll - # run every test in the group - $foundIt = 1; - last; - } else { - # maybe, it at least matches the group - my $num = "_" . $test->{'num'}; - if ($num eq $1) { - $foundIt = 1; - last; - } - } - } - } - foreach (@$testsToMatch) { - my $protoName = $groupHash{'group'} . "_" . $test->{'num'}; - if ($protoName =~ /$_/) { - if (not defined $1) { - # In this case it's just the group name, so we'll - # run every test in the group - $foundIt = 1; - last; - } else { - # maybe, it at least matches the group - my $num = "_" . $test->{'num'}; - if ($num eq $1) { - $foundIt = 1; - last; - } - } - } - } - - next unless $foundIt; - } - - # This is a test, so run it. 
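# Illustrative sketch only, not part of the patch: the selection rule the loop
# above applies to each entry of the testsToRun list -- a bare group name runs
# every test in that group, while "group_N" runs only test N. The group name
# and test numbers below are invented.
use strict;
use warnings;

sub selected {
    my ($spec, $group, $num) = @_;
    if ($spec =~ /^$group(_[0-9]+)?$/) {
        return 1 if !defined $1;      # bare group name: run the whole group
        return 1 if $1 eq "_$num";    # group_N: run only that test number
    }
    return 0;
}

print selected('HCat_Table',   'HCat_Table', 3), "\n";   # 1 (whole group)
print selected('HCat_Table_3', 'HCat_Table', 3), "\n";   # 1 (exact test)
print selected('HCat_Table_4', 'HCat_Table', 3), "\n";   # 0 (different test)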
- my %testHash = %groupHash; - $testHash{'comment'} = undef; # no inheritance of comments - foreach (keys(%$test)) { - $testHash{$_} = $test->{$_}; - } - my $testName = $testHash{'group'} . "_" . $testHash{'num'}; - - if ( $groupExecuted{ $group->{'name'} }== 0 ){ - $groupExecuted{ $group->{'name'} }=1; - - mkpath( [ $xmlDir ] , 1, 0777) if ( ! -e $xmlDir ); - - my $filename = $group->{'name'}.".xml"; - if ($generateJunitReport) { - $report = new TestReport ( $properties, "$xmlDir/$filename" ); - $report->purge(); - } - } - - # Have we not reached the starting point yet? - if (!$sawstart) { - if ($testName eq $startat) { - $sawstart = 1; - } else { - next; - } - } - - # Check that this test doesn't depend on an earlier test or tests - # that failed, or that the test wasn't marked as "ignore". - # Don't abort if that test wasn't run, just assume the - # user knew what they were doing and set it up right. - my $skipThisOne = 0; - foreach (keys(%testHash)) { - if (/^depends_on/ && defined($testStatuses->{$testHash{$_}}) && - $testStatuses->{$testHash{$_}} ne $passedStr) { - - print $log "TEST FAILED DEPENDENCY <$testName> at " . time . - ": depended on $testHash{$_} which returned a status of $testStatuses->{$testHash{$_}}\n"; - - $testStatuses->{$testName} = $dependStr; - $skipThisOne = 1; - last; - } - # if the test is not applicable to current execution mode - # ignore it - if(/^exectype$/i && $testHash{$_} !~ /$self->{'exectype'}/i) - { - print $log "TEST IGNORED <$testName> at " . time . ". Message: running mode ($self->{'exectype'}) and exectype in test ($testHash{'exectype'}) do not match\n"; - $testStatuses->{$testName} = $skippedStr; - $skipThisOne = 1; - last; - } - - # if the test is marked as 'ignore', - # ignore it... unless option to ignore the ignore is in force - if(/^ignore$/i) - { - if($self->{'ignore'} eq 'true') - { - print $log "TEST IGNORED <$testName> at " . time . ". Message: $testHash{'ignore'}\n"; - $testStatuses->{$testName} = $skippedStr; - $skipThisOne = 1; - last; - } - elsif ($testHash{'ignore'} ne 'false') - { - print $log "TEST _NOT_ IGNORED <$testName> at " . time . ". Message: $testHash{'ignore'}\n"; - } - } - } - - if ($skipThisOne) { - printResults($testStatuses, $log, "Results so far"); - next; - } - - # Check if output comparison should be skipped. - my $dontCompareThisOne = 0; # true for tests with key 'noverify' - my $copyResults = 1; # no need to copy output to local if noverify - foreach (keys(%testHash)) { - - if(/^noverify$/i ) - { - $dontCompareThisOne = 1; - $copyResults = 0; - last; - } - } - - # print $log "Beginning test $testName at " . time . "\n"; - print $log "Beginning test $testName at " . time . ($testHash{'comment'} ? ", comment: $testHash{'comment'}" : "") . 
"\n"; - my %dbinfo = ( - 'testrun_id' => $testHash{'trid'}, - 'test_type' => $testHash{'driver'}, - #'test_file' => $testHash{'file'}, - 'test_file' => $confFile, - 'test_group' => $testHash{'group'}, - 'test_num' => $testHash{'num'}, - ); - my $beginTime = time; - my ($testResult, $benchmarkResult); - eval { - - - my @SQLQuery = split /;/, $testHash{'sql'}; - - # Throw out the last one if it is just space - if ($SQLQuery[$#SQLQuery] =~ /^\s+$/) { $#SQLQuery--; } - - # If the last one is a comment, decrement the count - if ($#SQLQuery > 0 && $SQLQuery[$#SQLQuery] !~ /select/i && $SQLQuery[$#SQLQuery] =~ /--/) { - $#SQLQuery--; - } - - $testHash{'queries'} = \@SQLQuery; - - $testResult = $self->runTest(\%testHash, $log, $copyResults); - my $endTime = time; - $testDuration = $endTime - $beginTime; - - $benchmarkResult = $self->generateBenchmark(\%testHash, $log); - - my $result; - if( $dontCompareThisOne ) { - $result = 1; - print $log "TEST MARKED NOVERIFY <$testName>\n"; - } else { - # implementing: - # Bugzilla Ticket 3850819 - aborted scripts has test counted as failed when using command line verificaitons - if ((defined %testHash->{'rc'}) && (%testHash->{'rc'} == 0) && ($testResult->{'rc'} != 0)) { - die "Test run assumed aborted as 'rc' = 0 expected, but actual 'rc' = $testResult->{'rc'}\n"; - } - - $result = $self->compare($testResult, $benchmarkResult, $log, \%testHash); - } - - my $command = $self->getCommand(\%testHash); - if ($result) { - $msg = "TEST SUCCEEDED <$testName> at " . time . ", command: $command, duration: $testDuration\n"; - $testStatuses->{$testName} = $passedStr; - } else { - $msg = "TEST FAILED <$testName> at " . time . ", command: $command, duration: $testDuration\n"; - $testStatuses->{$testName} = $failedStr; - } - print $log $msg; - - - $dbinfo{'duration'} = $testDuration; - $self->recordResults($result, $testResult, $benchmarkResult, - \%dbinfo, $log); - }; - - if ($@) { - my $endTime = time; - print $log "TEST ABORTED <$testName> at " . time . "\n"; - print $log "$0::$subName FAILED: Failed to run test $testName <$@>\n"; - $testStatuses->{$testName} = $abortedStr; - $testDuration = $endTime - $beginTime; - $dbinfo{'duration'} = $testDuration; - } - - eval { - $dbinfo{'status'} = $testStatuses->{$testName}; - if($dbh) { - $dbh->insertTestCase(\%dbinfo); - } - }; - if ($@) { - chomp $@; - warn "Failed to insert test case info, error <$@>\n"; - } - - $self->cleanup($testStatuses->{$testName}, \%testHash, $testResult, - $benchmarkResult, $log); - - if ($generateJunitReport) { - $report->testcase( $group->{'name'}, $testName, $testDuration, $msg, $testStatuses->{$testName} ) if ( $report ); - } - - $groupDuration = $groupDuration + $testDuration; - $totalDuration = $totalDuration + $testDuration; - printResults($testStatuses, $log, "Results so far"); - } - if ($generateJunitReport && $report ) { - my $reportname= $group->{'name'}; - if ( $globalHash{'reportname'} ) { - $reportname= $globalHash{'reportname'}; - } - # $report->systemOut( $logname, $reportname ); - printGroupResultsXml( $report, $group->{'name'}, $testStatuses, $groupDuration ); - } - $report = 0; - $groupDuration=0; - - } - - # Do the global cleanup - $self->globalCleanup(\%globalHash, $log); -} - -############################################################################## -# Sub: printGroupResultsXml -# Print the results for the group using junit xml schema using values from the testStatuses hash. 
-# -# Paramaters: -# $report - the report object to use to generate the report -# $groupName - the name of the group to report totals for -# $testStatuses - the hash containing the results for the tests run so far -# $totalDuration- The total time it took to run the group of tests -# -# Returns: -# None. -# -sub printGroupResultsXml -{ - my ( $report, $groupName, $testStatuses, $totalDuration) = @_; - $totalDuration=0 if ( !$totalDuration ); - - my ($pass, $fail, $abort, $depend) = (0, 0, 0, 0); - - foreach my $key (keys(%$testStatuses)) { - if ( $key =~ /^$groupName/ ){ - ($testStatuses->{$key} eq $passedStr) && $pass++; - ($testStatuses->{$key} eq $failedStr) && $fail++; - ($testStatuses->{$key} eq $abortedStr) && $abort++; - ($testStatuses->{$key} eq $dependStr) && $depend++; - } - } - - my $total= $pass + $fail + $abort; - $report->totals( $groupName, $total, $fail, $abort, $totalDuration ); - -} - - 1; - Index: src/test/e2e/hcatalog/drivers/TestDriverPig.pm =================================================================== --- src/test/e2e/hcatalog/drivers/TestDriverPig.pm (revision 0) +++ src/test/e2e/hcatalog/drivers/TestDriverPig.pm (revision 0) @@ -0,0 +1,1000 @@ +package TestDriverPig; + +############################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### +# Test driver for pig nightly tests. +# +# + +use TestDriver; +use IPC::Run; # don't do qw(run), it screws up TestDriver which also has a run method +use Digest::MD5 qw(md5_hex); +use Util; +use File::Path; +use Cwd; + +use English; + +our $className= "TestDriver"; +our @ISA = "$className"; +our $ROOT = (defined $ENV{'HARNESS_ROOT'} ? $ENV{'HARNESS_ROOT'} : die "ERROR: You must set environment variable HARNESS_ROOT\n"); +our $toolpath = "$ROOT/libexec/HCatTest"; + +my $passedStr = 'passed'; +my $failedStr = 'failed'; +my $abortedStr = 'aborted'; +my $skippedStr = 'skipped'; +my $dependStr = 'failed_dependency'; + +sub new +{ + # Call our parent + my ($proto) = @_; + my $class = ref($proto) || $proto; + my $self = $class->SUPER::new; + + bless($self, $class); + return $self; +} + +sub replaceParameters +{ +##!!! 
Move this to Util.pm + + my ($self, $cmd, $outfile, $testCmd, $log) = @_; + + # $self + $cmd =~ s/:LATESTOUTPUTPATH:/$self->{'latestoutputpath'}/g; + + # $outfile + $cmd =~ s/:OUTPATH:/$outfile/g; + + # $ENV + $cmd =~ s/:PIGHARNESS:/$ENV{HARNESS_ROOT}/g; + + # $testCmd + $cmd =~ s/:INPATH:/$testCmd->{'inpathbase'}/g; + $cmd =~ s/:OUTPATH:/$outfile/g; + $cmd =~ s/:FUNCPATH:/$testCmd->{'funcjarPath'}/g; + $cmd =~ s/:PIGPATH:/$testCmd->{'pigpath'}/g; + $cmd =~ s/:RUNID:/$testCmd->{'UID'}/g; + $cmd =~ s/:USRHOMEPATH:/$testCmd->{'userhomePath'}/g; + $cmd =~ s/:MAPREDJARS:/$testCmd->{'mapredjars'}/g; + $cmd =~ s/:SCRIPTHOMEPATH:/$testCmd->{'scriptPath'}/g; + $cmd =~ s/:DBUSER:/$testCmd->{'dbuser'}/g; + $cmd =~ s/:DBNAME:/$testCmd->{'dbdb'}/g; +# $cmd =~ s/:LOCALINPATH:/$testCmd->{'localinpathbase'}/g; +# $cmd =~ s/:LOCALOUTPATH:/$testCmd->{'localoutpathbase'}/g; +# $cmd =~ s/:LOCALTESTPATH:/$testCmd->{'localpathbase'}/g; + $cmd =~ s/:BMPATH:/$testCmd->{'benchmarkPath'}/g; + $cmd =~ s/:TMP:/$testCmd->{'tmpPath'}/g; + $cmd =~ s/:HDFSTMP:/tmp\/$testCmd->{'runid'}/g; + + if ( $testCmd->{'hadoopSecurity'} eq "secure" ) { + $cmd =~ s/:REMOTECLUSTER:/$testCmd->{'remoteSecureCluster'}/g; + } else { + $cmd =~ s/:REMOTECLUSTER:/$testCmd->{'remoteNotSecureCluster'}/g; + } + + return $cmd; +} + +sub globalSetup +{ + my ($self, $globalHash, $log) = @_; + my $subName = (caller(0))[3]; + + + # Setup the output path + my $me = `whoami`; + chomp $me; + $globalHash->{'runid'} = $me . "." . time; + + # if "-ignore false" was provided on the command line, + # it means do run tests even when marked as 'ignore' + if(defined($globalHash->{'ignore'}) && $globalHash->{'ignore'} eq 'false') + { + $self->{'ignore'} = 'false'; + } + + $globalHash->{'outpath'} = $globalHash->{'outpathbase'} . "/" . $globalHash->{'runid'} . "/"; + $globalHash->{'localpath'} = $globalHash->{'localpathbase'} . "/" . $globalHash->{'runid'} . "/"; + + # add libexec location to the path + if (defined($ENV{'PATH'})) { + $ENV{'PATH'} = $globalHash->{'scriptPath'} . ":" . $ENV{'PATH'}; + } + else { + $ENV{'PATH'} = $globalHash->{'scriptPath'}; + } + + my @cmd = ($self->getPigCmd($globalHash, $log), '-e', 'mkdir', $globalHash->{'outpath'}); + + + print $log "Going to run " . join(" ", @cmd) . "\n"; + IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; + + IPC::Run::run(['mkdir', '-p', $globalHash->{'localpath'}], \undef, $log, $log) or + die "Cannot create localpath directory " . $globalHash->{'localpath'} . + " " . "$ERRNO\n"; + + IPC::Run::run(['mkdir', '-p', $globalHash->{'benchmarkPath'}], \undef, $log, $log) or + die "Cannot create benchmark directory " . $globalHash->{'benchmarkPath'} . + " " . "$ERRNO\n"; + + # Create the temporary directory + IPC::Run::run(['mkdir', '-p', $globalHash->{'tmpPath'}], \undef, $log, $log) or + die "Cannot create temporary directory " . $globalHash->{'tmpPath'} . + " " . "$ERRNO\n"; + + # Create the HDFS temporary directory + @cmd = ($self->getPigCmd($globalHash, $log), '-e', 'mkdir', "tmp/$globalHash->{'runid'}"); + print $log "Going to run " . join(" ", @cmd) . "\n"; + IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; +} + +sub globalCleanup +{ + my ($self, $globalHash, $log) = @_; + + IPC::Run::run(['rm', '-rf', $globalHash->{'tmpPath'}], \undef, $log, $log) or + warn "Cannot remove temporary directory " . $globalHash->{'tmpPath'} . + " " . 
"$ERRNO\n"; + + # Cleanup the HDFS temporary directory + my @cmd = ($self->getPigCmd($globalHash, $log), '-e', 'fs', '-rmr', "tmp/$globalHash->{'runid'}"); + print $log "Going to run " . join(" ", @cmd) . "\n"; + IPC::Run::run(\@cmd, \undef, $log, $log) or die "Cannot create HDFS directory " . $globalHash->{'outpath'} . ": $? - $!\n"; +} + + +sub runTest +{ + my ($self, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + + # Check that we should run this test. If the current execution type + # doesn't match the execonly flag, then skip this one. + if ($self->wrongExecutionMode($testCmd)) { + print $log "Skipping test $testCmd->{'group'}" . "_" . + $testCmd->{'num'} . " since it is executed only in " . + $testCmd->{'execonly'} . " mode and we are executing in " . + $testCmd->{'exectype'} . " mode.\n"; + my %result; + return \%result; + } + + if ( $testCmd->{'hcat_prep'} ) { + Util::prepareHCat($self, $testCmd, $log); + } + # Handle the various methods of running used in + # the original TestDrivers + + if ( $testCmd->{'pig'} && $self->hasCommandLineVerifications( $testCmd, $log) ) { + return $self->runPigCmdLine( $testCmd, $log, 1); + } elsif( $testCmd->{'pig'} ){ + # If the results are written to a table run the command and then + # run a another Pig script to dump the results of the table. + my $result; + if (defined($testCmd->{'result_table'})) { + $result = $self->runPig( $testCmd, $log, 0); + my @results = (); + my @outputs = (); + if (ref($testCmd->{'result_table'}) ne 'ARRAY') { + $results[0] = $testCmd->{'result_table'}; + } else { + @results = @{$testCmd->{'result_table'}}; + } + my $stores = $self->countStores($testCmd); + + my $id = 0; # regular ouput count + for (my $i = 0; $i < @results; $i++) { + if ($results[$i] ne '?') { + my %modifiedTestCmd = %{$testCmd}; + $pigfiles[$i] = $testCmd->{'localpath'} . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . + ".dumptable.$i.pig"; + $outfiles[$i] = $testCmd->{'thisResultsPath'} . "/" . + $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".$i.out"; + $tableName = $results[$i]; + $modifiedTestCmd{'num'} = $testCmd->{'num'} . "_" . $i . "_benchmark"; + $modifiedTestCmd{'pig'} = "a = load '$tableName' using org.apache.hcatalog.pig.HCatLoader(); store a into ':OUTPATH:';"; + my $r = $self->runPig(\%modifiedTestCmd, $log, 1); + $outputs[$i] = $r->{'output'}; + } else { + $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + # Copy result file out of hadoop + my @baseCmd = $self->getPigCmd($testCmd, $log); + my $testOut = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + $outputs[$i] = $testOut; + $id++; + } + } + $result->{'outputs'}=\@outputs; + if ($self->countStores($testCmd)==1) { + $result->{'output'}=$outputs[0]; + } + } + else { + $result = $self->runPig( $testCmd, $log, 1); + } + return $result; + } elsif( $testCmd->{'script'} ){ + return $self->runScript( $testCmd, $log ); + } else { + die "$subName FATAL Did not find a testCmd that I know how to handle"; + } +} + +sub runPigCmdLine +{ + my ($self, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + my %result; + + # Set up file locations + my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".pig"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . 
$testCmd->{'num'} . ".out"; + my $stdoutfile = "$outdir/stdout"; + my $stderrfile = "$outdir/stderr"; + + mkpath( [ $outdir ] , 0, 0755) if ( ! -e outdir ); + if ( ! -e $outdir ){ + print $log "$0.$subName FATAL could not mkdir $outdir\n"; + die "$0.$subName FATAL could not mkdir $outdir\n"; + } + + # Write the pig script to a file. + my $pigcmd = $self->replaceParameters( $testCmd->{'pig'}, $outfile, $testCmd, $log ); + + open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; + print FH $pigcmd . "\n"; + close(FH); + + # Build the command + my @baseCmd = $self->getPigCmd($testCmd, $log); + my @cmd = @baseCmd; + + # Add option -l giving location for secondary logs + ##!!! Should that even be here? + my $locallog = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".log"; + push(@cmd, "-logfile"); + push(@cmd, $locallog); + + # Add pig parameters if they're provided + if (defined($testCmd->{'pig_params'})) { + # Processing :PARAMPATH: in parameters + foreach my $param (@{$testCmd->{'pig_params'}}) { + $param =~ s/:PARAMPATH:/$testCmd->{'paramPath'}/g; + } + push(@cmd, @{$testCmd->{'pig_params'}}); + } + + # Add pig file and redirections + push(@cmd, $pigfile); + my $command= join (" ", @cmd) . " 1> $stdoutfile 2> $stderrfile"; + + # Run the command + print $log "$0:$subName Going to run command: ($command)\n"; + print $log "$0:$subName STD OUT IS IN FILE ($stdoutfile)\n"; + print $log "$0:$subName STD ERROR IS IN FILE ($stderrfile)\n"; + print $log "$0:$subName PIG SCRIPT CONTAINS ($pigfile): \n$pigcmd\n"; + + my @result=`$command`; + $result{'rc'} = $? >> 8; + $result{'output'} = $outfile; + $result{'stdout'} = `cat $stdoutfile`; + $result{'stderr'} = `cat $stderrfile`; + $result{'stderr_file'} = $stderrfile; + + print $log "STD ERROR CONTAINS:\n$result{'stderr'}\n"; + + return \%result; +} + + +sub runScript +{ + my ($self, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + my %result; + + # Set up file locations + my $script = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".sh"; + my $outdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + my $outfile = "$outdir/script.out"; + my $stdoutfile = "$outdir/script.out"; + my $stderrfile = "$outdir/script.err"; + + mkpath( [ $outdir ] , 0, 0755) if ( ! -e outdir ); + if ( ! -e $outdir ){ + print $log "$0.$subName FATAL could not mkdir $outdir\n"; + die "$0.$subName FATAL could not mkdir $outdir\n"; + } + + # Write the script to a file + my $cmd = $self->replaceParameters( $testCmd->{'script'}, $outfile, $testCmd, $log ); + + open(FH, ">$script") or die "Unable to open file $script to write script, $ERRNO\n"; + print FH $cmd . "\n"; + close(FH); + + my @result=`chmod +x $script`; + + # Build the command + my $command= "$script 1> $stdoutfile 2> $stderrfile"; + + # Run the script + print $log "$0:$subName Going to run command: ($command)\n"; + print $log "$0:$subName STD OUT IS IN FILE ($stdoutfile)\n"; + print $log "$0:$subName STD ERROR IS IN FILE ($stderrfile)\n"; + print $log "$0:$subName SCRIPT CONTAINS ($script): \n$cmd\n"; + + @result=`$command`; + $result{'rc'} = $? 
>> 8; + $result{'output'} = $outfile; + $result{'stdout'} = `cat $stdoutfile`; + $result{'stderr'} = `cat $stderrfile`; + $result{'stderr_file'} = $stderrfile; + + print $log "STD ERROR CONTAINS:\n$result{'stderr'}\n"; + + return \%result; +} + + +sub getPigCmd($$$) +{ + my ($self, $testCmd, $log) = @_; + + my @pigCmd; + + # set the PIG_CLASSPATH environment variable + my $pcp .= $testCmd->{'jythonjar'} if (defined($testCmd->{'jythonjar'})); + $pcp .= ":" . $testCmd->{'classpath'} if (defined($testCmd->{'classpath'})); + $pcp .= ":" . $testCmd->{'additionaljars'} if (defined($testCmd->{'additionaljars'})); + # Only add testconfigpath to PIG_CLASSPATH if HADOOP_HOME isn't defined + $pcp .= ":" . $testCmd->{'testconfigpath'} if ($testCmd->{'exectype'} ne "local"); #&& (! defined $ENV{'HADOOP_HOME'}); + + # Set it in our current environment. It will get inherited by the IPC::Run + # command. + $ENV{'PIG_CLASSPATH'} = $pcp; + + @pigCmd = ("$testCmd->{'pigpath'}/bin/pig"); + + if (defined($testCmd->{'additionaljars'})) { + push(@pigCmd, '-Dpig.additional.jars='.$testCmd->{'additionaljars'}); + } + + if ($testCmd->{'exectype'} eq "local") { + push(@{$testCmd->{'java_params'}}, "-Xmx1024m"); + push(@pigCmd, ("-x", "local")); + } + + my $opts .= "-Dhcat.metastore.uri=$testCmd->{'thriftserver'}"; + if (defined($testCmd->{'java_params'})) { + $opts = $opts . " " . join(" ", @{$testCmd->{'java_params'}}); + } + + $ENV{'PIG_OPTS'} = $opts; + + print $log "Returning Pig command " . join(" ", @pigCmd) . "\n"; + print $log "With PIG_CLASSPATH set to " . $ENV{'PIG_CLASSPATH'} . " and PIG_OPTS set to " . $ENV{'PIG_OPTS'} . "\n"; + return @pigCmd; +} + +sub dumpPigTable +{ + my ($self, $testCmd, $table, $log, $id) = @_; + my $subName = (caller(0))[3]; + + my %result; + + # Write the pig script to a file. + my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.pig"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . "dump.out"; + + open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; + print FH "a = load '$table' using org.apache.hcatalog.pig.HCatLoader(); store a into '$outfile';\n"; + close(FH); + + + # Build the command + my @baseCmd = $self->getPigCmd($testCmd, $log); + my @cmd = @baseCmd; + + push(@cmd, $pigfile); + + + # Run the command + print $log "Setting PIG_CLASSPATH to $ENV{'PIG_CLASSPATH'}\n"; + print $log "$0::$className::$subName INFO: Going to run pig command: @cmd\n"; + + IPC::Run::run(\@cmd, \undef, $log, $log) or + die "Failed running $pigfile\n"; + $result{'rc'} = $? >> 8; + + + # Get results from the command locally + my $localoutfile; + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . $id . ".dump.out"; + my $stores = $self->countStores($testCmd); + + $outfile = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + return $outfile; +} + +sub runPig +{ + my ($self, $testCmd, $log, $copyResults) = @_; + my $subName = (caller(0))[3]; + + my %result; + + # Write the pig script to a file. + my $pigfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".pig"; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + my $pigcmd = $self->replaceParameters( $testCmd->{'pig'}, $outfile, $testCmd, $log ); + + open(FH, "> $pigfile") or die "Unable to open file $pigfile to write pig script, $ERRNO\n"; + print FH $pigcmd . 
"\n"; + close(FH); + + + # Build the command + my @baseCmd = $self->getPigCmd($testCmd, $log); + my @cmd = @baseCmd; + + # Add option -l giving location for secondary logs + my $locallog = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".log"; + push(@cmd, "-logfile"); + push(@cmd, $locallog); + + # Add pig parameters if they're provided + if (defined($testCmd->{'pig_params'})) { + # Processing :PARAMPATH: in parameters + foreach my $param (@{$testCmd->{'pig_params'}}) { + $param =~ s/:PARAMPATH:/$testCmd->{'paramPath'}/g; + } + push(@cmd, @{$testCmd->{'pig_params'}}); + } + + push(@cmd, $pigfile); + + + # Run the command + print $log "Setting PIG_CLASSPATH to $ENV{'PIG_CLASSPATH'}\n"; + print $log "$0::$className::$subName INFO: Going to run pig command: @cmd\n"; + + IPC::Run::run(\@cmd, \undef, $log, $log) or + die "Failed running $pigfile\n"; + $result{'rc'} = $? >> 8; + + + # Get results from the command locally + my $localoutfile; + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + my $stores = $self->countStores($testCmd); + + # single query + if ($stores == 1) { + if ($copyResults) { + $result{'output'} = $self->postProcessSingleOutputFile($outfile, $localdir, \@baseCmd, $testCmd, $log); + $result{'originalOutput'} = "$localdir/out_original"; # populated by postProcessSingleOutputFile + } else { + $result{'output'} = "NO_COPY"; + } + } + # multi query + else { + my @outfiles = (); + for (my $id = 1; $id <= ($stores); $id++) { + $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out/$id"; + $localoutfile = $outfile . ".$id"; + + # Copy result file out of hadoop + my $testOut; + if ($copyResults) { + $testOut = $self->postProcessSingleOutputFile($localoutfile, $localdir, \@baseCmd, $testCmd, $log); + } else { + $testOut = "NO_COPY"; + } + push(@outfiles, $testOut); + } + ##!!! originalOutputs not set! Needed? + $result{'outputs'} = \@outfiles; + } + + # Compare doesn't get the testCmd hash, so I need to stuff the necessary + # info about sorting into the result. + if (defined $testCmd->{'sortArgs'} && $testCmd->{'sortArgs'}) { + $result{'sortArgs'} = $testCmd->{'sortArgs'}; + } + + return \%result; +} + +sub postProcessSingleSQLOutputFile +{ + my ($self, $outfile, $testCmd, $log, $isBenchmark) = @_; + + # If requested, process the data to smooth over floating point + # differences. + if (defined $testCmd->{'floatpostprocess'} && + defined $testCmd->{'delimiter'}) { + # Move the file to a temp file and run through the pre-processor. + my $tmpfile = "$outfile.tmp"; + link($outfile, $tmpfile) or + die "Unable to create temporary file $tmpfile, $!\n"; + unlink($outfile) or + die "Unable to unlink file $outfile, $!\n"; + open(IFH, "< $tmpfile") or + die "Unable to open file $tmpfile, $!\n"; + open(OFH, "> $outfile") or + die "Unable to open file $outfile, $!\n"; + my @cmd = ("$toolpath/floatpostprocessor.pl", + $testCmd->{'delimiter'}); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, \*IFH, \*OFH, $log) or + die "Failed to run float postprocessor, $!\n"; + close(IFH); + close(OFH); + unlink($tmpfile); + } + + if ($isBenchmark && defined $testCmd->{'nullpostprocess'}) { + # Move the file to a temp file and run through the pre-processor. 
+ my $tmpfile = "$outfile.tmp"; + link($outfile, $tmpfile) or + die "Unable to create temporary file $tmpfile, $!\n"; + unlink($outfile) or + die "Unable to unlink file $outfile, $!\n"; + open(IFH, "< $tmpfile") or + die "Unable to open file $tmpfile, $!\n"; + open(OFH, "> $outfile") or + die "Unable to open file $outfile, $!\n"; + my @cmd = ("sed", "s/NULL//g"); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, \*IFH, \*OFH, $log) or + die "Failed to run float postprocessor, $!\n"; + close(IFH); + close(OFH); + unlink($tmpfile); + } + + # Sort the results for the benchmark compare. + my $sortfile = "$outfile.sorted"; + my @cmd = ("sort", $outfile); + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + IPC::Run::run(\@cmd, '>', "$sortfile"); + + return $sortfile; +} + +sub postProcessSingleOutputFile +{ + my ($self, $outfile, $localdir, $baseCmd, $testCmd, $log) = @_; + my $subName = (caller(0))[3]; + + my @baseCmd = @{$baseCmd}; + my @copyCmd = @baseCmd; + push(@copyCmd, ('-e', 'copyToLocal', $outfile, $localdir)); + print $log "$0::$className::$subName INFO: Going to run pig command: @copyCmd\n"; + + IPC::Run::run(\@copyCmd, \undef, $log, $log) or die "Cannot copy results from HDFS $outfile to $localdir\n"; + + + # Sort the result if necessary. Keep the original output in one large file. + # Use system not IPC run so that the '*' gets interpolated by the shell. + + # Build command to: + # 1. Combine part files + my $fppCmd = "cat $localdir/map* $localdir/part* 2>/dev/null"; + + # 2. Standardize float precision + if (defined $testCmd->{'floatpostprocess'} && + defined $testCmd->{'delimiter'}) { + $fppCmd .= " | $toolpath/floatpostprocessor.pl '" . + $testCmd->{'delimiter'} . "'"; + } + + $fppCmd .= " > $localdir/out_original"; + + # run command + print $log "$fppCmd\n"; + system($fppCmd); + + # Sort the results for the benchmark compare. + my @sortCmd = ('sort', "$localdir/out_original"); + print $log join(" ", @sortCmd) . "\n"; + IPC::Run::run(\@sortCmd, '>', "$localdir/out_sorted"); + + return "$localdir/out_sorted"; +} + +sub generateBenchmark +{ + my ($self, $testCmd, $log) = @_; + + my %result; + + my @SQLQuery = @{$testCmd->{'sql'}}; + my @SQLQuery = (); + if (ref($testCmd->{'sql'}) ne 'ARRAY') { + $SQLQuery[0] = $testCmd->{'sql'}; + } else { + @SQLQuery = @{$testCmd->{'sql'}}; + } + + if ($#SQLQuery == 0) { + my $outfile = $self->generateSingleSQLBenchmark($testCmd, $SQLQuery[0], undef, $log); + $result{'output'} = $outfile; + } else { + my @outfiles = (); + for (my $id = 0; $id < ($#SQLQuery + 1); $id++) { + my $sql = $SQLQuery[$id]; + my $outfile = $self->generateSingleSQLBenchmark($testCmd, $sql, ($id+1), $log); + push(@outfiles, $outfile); + } + $result{'outputs'} = \@outfiles; + } + + return \%result; +} + +sub generateSingleSQLBenchmark +{ + my ($self, $testCmd, $sql, $id, $log) = @_; + + my $qmd5 = substr(md5_hex($testCmd->{'pig'}), 0, 5); + my $sqlfile = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".benchmark.$id.sql"; + my $outfile = $testCmd->{'benchmarkPath'} . "/" . $testCmd->{'group'} . "_" . $testCmd->{'num'}; + + $outfile .= defined($id) ? ".$id" . ".out" : ".out"; + + my $outfp; + open($outfp, "> $outfile") or + die "Unable to open output file $outfile, $!\n"; + + open(FH, "> $sqlfile") or + die "Unable to open file $sqlfile to write SQL script, $ERRNO\n"; + print FH $sql; + close(FH); + + Util::runDbCmd($testCmd, $log, $sqlfile, $outfp); + + $rcs[$i] = $? 
>> 8; + close($outfp); + + my $localdir = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + + $outfile = + $self->postProcessSingleSQLOutputFile($outfile, $testCmd, $log); + + return $outfile; +} + +sub hasCommandLineVerifications +{ + my ($self, $testCmd, $log) = @_; + + foreach my $key ('rc', 'expected_out', 'expected_out_regex', 'expected_err', 'expected_err_regex', + 'not_expected_out', 'not_expected_out_regex', 'not_expected_err', 'not_expected_err_regex' ) { + if (defined $testCmd->{$key}) { + return 1; + } + } + return 0; +} + + +sub compare +{ + my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_; + my $subName = (caller(0))[3]; + + # Check that we should run this test. If the current execution type + # doesn't match the execonly flag, then skip this one. + if ($self->wrongExecutionMode($testCmd)) { + # Special magic value + return $self->{'wrong_execution_mode'}; + } + + # For now, if the test has + # - testCmd pig, and 'sql' for benchmark, then use comparePig, i.e. using benchmark + # - any verification directives formerly used by CmdLine or Script drivers (rc, regex on out and err...) + # then use compareScript even if testCmd is "pig" + # - testCmd script, then use compareScript + # - testCmd pig, and none of the above, then use comparePig + # + # Later, should add ability to have same tests both verify with the 'script' directives, + # and do a benchmark compare, if it was a pig cmd. E.g. 'rc' could still be checked when + # doing the benchmark compare. + + if ( $testCmd->{'script'} || $self->hasCommandLineVerifications( $testCmd, $log) ){ + return $self->compareScript ( $testResult, $log, $testCmd); + } elsif( $testCmd->{'pig'} ){ + return $self->comparePig ( $testResult, $benchmarkResult, $log, $testCmd); + } else { + # Should have been caught by runTest, still... + print $log "$0.$subName WARNING Did not find a testCmd that I know how to handle\n"; + return 0; + } +} + + +sub compareScript +{ + my ($self, $testResult, $log, $testCmd) = @_; + my $subName = (caller(0))[3]; + + + # IMPORTANT NOTES: + # + # If you are using a regex to compare stdout or stderr + # and if the pattern that you are trying to match spans two line + # explicitly use '\n' (without the single quotes) in the regex + # + # If any verification directives are added here + # do remember also to add them to the hasCommandLineVerifications subroutine. + # + # If the test conf file misspells the directive, you won't be told... + # + + my $result = 1; # until proven wrong... + + + # Return Code + if (defined $testCmd->{'rc'}) { + print $log "$0::$subName INFO Checking return code " . + "against expected <$testCmd->{'rc'}>\n"; + if ( (! defined $testResult->{'rc'}) || ($testResult->{'rc'} != $testCmd->{'rc'})) { + print $log "$0::$subName INFO Check failed: rc = <$testCmd->{'rc'}> expected, test returned rc = <$testResult->{'rc'}>\n"; + $result = 0; + } + } + + # Standard Out + if (defined $testCmd->{'expected_out'}) { + print $log "$0::$subName INFO Checking test stdout' " . + "as exact match against expected <$testCmd->{'expected_out'}>\n"; + if ($testResult->{'stdout'} ne $testCmd->{'expected_out'}) { + print $log "$0::$subName INFO Check failed: exact match of <$testCmd->{'expected_out'}> expected in stdout: $testResult->{'stdout'}\n"; + $result = 0; + } + } + + if (defined $testCmd->{'not_expected_out'}) { + print $log "$0::$subName INFO Checking test stdout " . 
+ "as NOT exact match against expected <$testCmd->{'expected_out'}>\n"; + if ($testResult->{'stdout'} eq $testCmd->{'not_expected_out'}) { + print $log "$0::$subName INFO Check failed: NON-match of <$testCmd->{'expected_out'}> expected to stdout: $testResult->{'stdout'}\n"; + $result = 0; + } + } + + if (defined $testCmd->{'expected_out_regex'}) { + print $log "$0::$subName INFO Checking test stdout " . + "for regular expression <$testCmd->{'expected_out_regex'}>\n"; + if ($testResult->{'stdout'} !~ $testCmd->{'expected_out_regex'}) { + print $log "$0::$subName INFO Check failed: regex match of <$testCmd->{'expected_out_regex'}> expected in stdout: $testResult->{'stdout'}\n"; + $result = 0; + } + } + + if (defined $testCmd->{'not_expected_out_regex'}) { + print $log "$0::$subName INFO Checking test stdout " . + "for NON-match of regular expression <$testCmd->{'not_expected_out_regex'}>\n"; + if ($testResult->{'stdout'} =~ $testCmd->{'not_expected_out_regex'}) { + print $log "$0::$subName INFO Check failed: regex NON-match of <$testCmd->{'not_expected_out_regex'}> expected in stdout: $testResult->{'stdout'}\n"; + $result = 0; + } + } + + # Standard Error + if (defined $testCmd->{'expected_err'}) { + print $log "$0::$subName INFO Checking test stderr " . + "as exact match against expected <$testCmd->{'expected_err'}>\n"; + if ($testResult->{'stderr'} ne $testCmd->{'expected_err'}) { + print $log "$0::$subName INFO Check failed: exact match of <$testCmd->{'expected_err'}> expected in stderr: $testResult->{'stderr_file'}\n"; + $result = 0; + } + } + + if (defined $testCmd->{'not_expected_err'}) { + print $log "$0::$subName INFO Checking test stderr " . + "as NOT an exact match against expected <$testCmd->{'expected_err'}>\n"; + if ($testResult->{'stderr'} eq $testCmd->{'not_expected_err'}) { + print $log "$0::$subName INFO Check failed: NON-match of <$testCmd->{'expected_err'}> expected to stderr: $testResult->{'stderr_file'}\n"; + $result = 0; + } + } + + if (defined $testCmd->{'expected_err_regex'}) { + print $log "$0::$subName INFO Checking test stderr " . + "for regular expression <$testCmd->{'expected_err_regex'}>\n"; + if ($testResult->{'stderr'} !~ $testCmd->{'expected_err_regex'}) { + print $log "$0::$subName INFO Check failed: regex match of <$testCmd->{'expected_err_regex'}> expected in stderr: $testResult->{'stderr_file'}\n"; + $result = 0; + } + } + + if (defined $testCmd->{'not_expected_err_regex'}) { + print $log "$0::$subName INFO Checking test stderr " . + "for NON-match of regular expression <$testCmd->{'not_expected_err_regex'}>\n"; + if ($testResult->{'stderr'} =~ $testCmd->{'not_expected_err_regex'}) { + print $log "$0::$subName INFO Check failed: regex NON-match of <$testCmd->{'not_expected_err_regex'}> expected in stderr: $testResult->{'stderr_file'}\n"; + $result = 0; + } + } + + return $result; +} + + +sub comparePig +{ + my ($self, $testResult, $benchmarkResult, $log, $testCmd) = @_; + my $subName = (caller(0))[3]; + + my $result; + my $stores = $self->countStores($testCmd); + + if ($stores == 1) { + $result = $self->compareSingleOutput($testResult, $testResult->{'output'}, + $benchmarkResult->{'output'}, $log); + } else { + my $res = 0; + for (my $id = 0; $id < ($stores); $id++) { + my $testOutput = ($testResult->{'outputs'})->[$id]; + my $benchmarkOutput = ($benchmarkResult->{'outputs'})->[$id]; + $res += $self->compareSingleOutput($testResult, $testOutput, + $benchmarkOutput, $log); + $result = ($res == ($stores)) ? 
1 : 0; + } + } + + return $result; +} + + +sub compareSingleOutput +{ + my ($self, $testResult, $testOutput, $benchmarkOutput, $log) = @_; + +print $log "testResult: $testResult testOutput: $testOutput benchmarkOutput: $benchmarkOutput\n"; + + # cksum the the two files to see if they are the same + my ($testChksm, $benchmarkChksm); + IPC::Run::run((['cat', $testOutput], '|', ['cksum']), \$testChksm, + $log) or die "$0: error: cannot run cksum on test results\n"; + IPC::Run::run((['cat', $benchmarkOutput], '|', ['cksum']), + \$benchmarkChksm, $log) or die "$0: error: cannot run cksum on benchmark\n"; + + chomp $testChksm; + chomp $benchmarkChksm; + print $log "test cksum: $testChksm\nbenchmark cksum: $benchmarkChksm\n"; + + my $result; + if ($testChksm ne $benchmarkChksm) { + print $log "Test output checksum does not match benchmark checksum\n"; + print $log "Test checksum = <$testChksm>\n"; + print $log "Expected checksum = <$benchmarkChksm>\n"; + print $log "RESULTS DIFFER: vimdiff " . cwd . "/$testOutput " . cwd . "/$benchmarkOutput\n"; + } else { + $result = 1; + } + + # Now, check if the sort order is specified + if (defined($testResult->{'sortArgs'})) { + Util::setLocale(); + my @sortChk = ('sort', '-cs'); + push(@sortChk, @{$testResult->{'sortArgs'}}); + push(@sortChk, $testResult->{'originalOutput'}); + print $log "Going to run sort check command: " . join(" ", @sortChk) . "\n"; + IPC::Run::run(\@sortChk, \undef, $log, $log); + my $sortrc = $?; + if ($sortrc) { + print $log "Sort check failed\n"; + $result = 0; + } + } + + return $result; +} + +############################################################################## +# Count the number of stores in a Pig Latin script, so we know how many files +# we need to compare. +# +sub countStores($$) +{ + my ($self, $testCmd) = @_; + + # Special work around for queries with more than one store that are not + # actually multiqueries. + if (defined $testCmd->{'notmq'}) { + return 1; + } + + my $count; + + # hope they don't have more than store per line + # also note that this won't work if you comment out a store + my @q = split(/\n/, $testCmd->{'pig'}); + for (my $i = 0; $i < @q; $i++) { + $count += $q[$i] =~ /store\s+[a-zA-Z][a-zA-Z0-9_]*\s+into/i; + } + + return $count; +} + +############################################################################## +# Check whether we should be running this test or not. +# +sub wrongExecutionMode($$) +{ + my ($self, $testCmd) = @_; + + # Check that we should run this test. If the current execution type + # doesn't match the execonly flag, then skip this one. + return (defined $testCmd->{'execonly'} && + $testCmd->{'execonly'} ne $testCmd->{'exectype'}); +} + +############################################################################## +# Sub: printGroupResultsXml +# Print the results for the group using junit xml schema using values from the testStatuses hash. +# +# Paramaters: +# $report - the report object to use to generate the report +# $groupName - the name of the group to report totals for +# $testStatuses - the hash containing the results for the tests run so far +# $totalDuration- The total time it took to run the group of tests +# +# Returns: +# None. 
+# +sub printGroupResultsXml +{ + my ( $report, $groupName, $testStatuses, $totalDuration) = @_; + $totalDuration=0 if ( !$totalDuration ); + + my ($pass, $fail, $abort, $depend) = (0, 0, 0, 0); + + foreach my $key (keys(%$testStatuses)) { + if ( $key =~ /^$groupName/ ){ + ($testStatuses->{$key} eq $passedStr) && $pass++; + ($testStatuses->{$key} eq $failedStr) && $fail++; + ($testStatuses->{$key} eq $abortedStr) && $abort++; + ($testStatuses->{$key} eq $dependStr) && $depend++; + } + } + + my $total= $pass + $fail + $abort; + $report->totals( $groupName, $total, $fail, $abort, $totalDuration ); + +} + +1; Index: src/test/e2e/hcatalog/drivers/Util.pm =================================================================== --- src/test/e2e/hcatalog/drivers/Util.pm (revision 1208047) +++ src/test/e2e/hcatalog/drivers/Util.pm (working copy) @@ -26,7 +26,290 @@ package Util; +use IPC::Run qw(run); +use Log::Log4perl qw(:easy); + +sub prepareHCat +{ + my ($self, $testCmd, $log) = @_; + my $outfile = $testCmd->{'outpath'} . $testCmd->{'group'} . "_" . $testCmd->{'num'} . ".out"; + my $hcatCmd = $self->replaceParameters( $testCmd->{'hcat_prep'}, $outfile, $testCmd, $log); + + my @hivefiles = (); + my @outfiles = (); + # Write the hive script to a file. + $hivefiles[0] = $testCmd->{'localpath'} . $testCmd->{'group'} . "_" . + $testCmd->{'num'} . ".0.sql"; + $outfiles[0] = $testCmd->{'thisResultsPath'} . "/" . $testCmd->{'group'} . + "_" . $testCmd->{'num'} . ".0.out"; + + open(FH, "> $hivefiles[0]") or + die "Unable to open file $hivefiles[0] to write SQL script, $ERRNO\n"; + print FH $testCmd->{'hcat_prep'} . "\n"; + close(FH); + + Util::runHCatCmdFromFile($testCmd, $log, $hivefiles[0]); +} + ############################################################################## +# Sub: setupHiveProperties +# +# Assure that necessary values are set in config in order to set Hive +# Java properties. +# +# Returns: +# Nothing +sub setupHiveProperties($$) +{ + my ($cfg, $log) = @_; + + # Set up values for the metastore + if (defined($cfg->{'metastore_thrift'}) && $cfg->{'metastore_thrift'} == 1) { + if (! defined $cfg->{'metastore_host'} || $cfg->{'metastore_host'} eq "") { + print $log "When using thrift, you must set the key " . + " 'metastore_host' to the machine your metastore is on\n"; + die "metastore_host is not set in existing.conf\n"; + } + + $cfg->{'metastore_connection'} = + "jdbc:$cfg->{'metastore_db'}://$cfg->{'metastore_host'}/hivemetastoredb?createDatabaseIfNotExist=true"; + + if (! defined $cfg->{'metastore_passwd'} || $cfg->{'metastore_passwd'} eq "") { + $cfg->{'metastore_passwd'} = 'hive'; + } + + if (! defined $cfg->{'metastore_port'} || $cfg->{'metastore_port'} eq "") { + $cfg->{'metastore_port'} = '9933'; + } + + $cfg->{'metastore_uri'} = + "thrift://$cfg->{'metastore_host'}:$cfg->{'metastore_port'}"; + } else { + $cfg->{'metastore_connection'} = + "jdbc:derby:;databaseName=metastore_db;create=true"; + $cfg->{'metastore_driver'} = "org.apache.derby.jdbc.EmbeddedDriver"; + } +} + +sub getHadoopCmd +{ + my ( $properties ) = @_; + + my $subName = (caller(0))[3]; + my @baseCmd; + + die "$0.$subName: null properties" if (! $properties ); + + my $cmd; + + $cmd = $properties->{'hadoopbin'}; + if ( ! -x "$cmd" ) { + print STDERR "\n$0::$subName WARNING: Can't find hadoop command: $cmd\n"; + $cmd = `which hadoop`; + chomp $cmd; + print STDERR "$0::$subName WARNING: Instead using command: $cmd\n"; + } + if ( ! 
-x "$cmd" ) { + die "\n$0::$subName FATAL: Hadoop command does not exist: $cmd\n"; + } + push (@baseCmd, $cmd); + + push (@baseCmd, '--config', $properties->{'testconfigpath'}) if defined($properties->{'testconfigpath'}); + + return @baseCmd; +} + +############################################################################## +# Sub: runHiveCmdFromFile +# +# Run the provided file using the Hive command line. +# +# cfg - The configuration file for the test +# log - reference to the log file, should be an open file pointer +# sql - name of file containing SQL to run. Optional, if present -f $sql +# will be appended to the command. +# outfile - open file pointer (or variable reference) to write stdout to for +# this test. Optional, will be written to $log if this value is not +# provided. +# outfile - open file pointer (or variable reference) to write stderr to for +# this test. Optional, will be written to $log if this value is not +# provided. +# noFailOnFail - if true, do not fail when the Hive command returns non-zero +# value. +# Returns: +# Nothing +sub runHiveCmdFromFile($$;$$$$) +{ + my ($cfg, $log, $sql, $outfile, $errfile, $noFailOnFail) = @_; + + if (!defined($ENV{'HADOOP_HOME'})) { + die "Cannot run hive when HADOOP_HOME environment variable is not set."; + } + + $outfile = $log if (!defined($outfile)); + $errfile = $log if (!defined($errfile)); + + my @cmd = ("$cfg->{'hivehome'}/bin/hive"); + + # Add all of the modified properties we want to set + push(@cmd, "--hiveconf", "hive.metastore.uris=$cfg->{'thriftserver'}"); + push(@cmd, "--hiveconf", "hive.metastore.local=false"); + + if( defined($cfg->{'metastore.principal'}) && ($cfg->{'metastore.principal'} =~ m/\S+/) + && ($cfg->{'metastore.principal'} ne '${metastore.principal}')){ + push(@cmd, "--hiveconf", "hive.metastore.sasl.enabled=true", "--hiveconf", "hive.metastore.kerberos.principal=$cfg->{'metastore.principal'}"); + } else { + push(@cmd, "--hiveconf", "hive.metastore.sasl.enabled=false"); + } + + if (defined($cfg->{'additionaljarspath'})) { + $ENV{'HIVE_AUX_JARS_PATH'} = $cfg->{'additionaljarspath'}; + } + + if (defined($cfg->{'hiveconf'})) { + foreach my $hc (@{$cfg->{'hiveconf'}}) { + push(@cmd, "--hiveconf", $hc); + } + } + + if (defined($cfg->{'hivecmdargs'})) { + push(@cmd, @{$cfg->{'hivecmdargs'}}); + } + + if (defined($cfg->{'hiveops'})) { + $ENV{'HIVE_OPTS'} = join(" ", @{$cfg->{'hiveops'}}); + } + + $ENV{'HIVE_HOME'} = $cfg->{'hivehome'}; + + my $envStr; + for my $k (keys(%ENV)) { + $envStr .= $k . "=" . $ENV{$k} . " " if ($k =~ /HADOOP/ || $k =~ /HIVE/); + } + $envStr .= " "; + + if (defined($sql)) { + push(@cmd, "-f", $sql); + } + print $log "Going to run hive command [" . join(" ", @cmd) . + "] with environment set to [$envStr]\n"; + my $runrc = run(\@cmd, \undef, $outfile, $errfile); + my $rc = $? >> 8; + + return $runrc if $runrc; # success + + if (defined($noFailOnFail) && $noFailOnFail) { + return $rc; + } else { + die "Failed running hive command [" . join(" ", @cmd) . "]\n"; + } +} + +############################################################################# +# Sub: runHiveCmdFromFile +# +# Run the provided file using the Hive command line. +# +# cfg - The configuration file for the test +# log - reference to the log file, should be an open file pointer +# sql - name of file containing SQL to run. Optional, if present -f $sql +# will be appended to the command. +# outfile - open file pointer (or variable reference) to write stdout to for +# this test. 
Optional, will be written to $log if this value is not +# provided. +# outfile - open file pointer (or variable reference) to write stderr to for +# this test. Optional, will be written to $log if this value is not +# provided. +# noFailOnFail - if true, do not fail when the Hive command returns non-zero +# value. +# Returns: +# Nothing +sub runHCatCmdFromFile($$;$$$$) +{ + my ($cfg, $log, $sql, $outfile, $errfile, $noFailOnFail) = @_; + + if (!defined($ENV{'HADOOP_HOME'})) { + die "Cannot run hive when HADOOP_HOME environment variable is not set."; + } + + $outfile = $log if (!defined($outfile)); + $errfile = $log if (!defined($errfile)); + + # unset HADOOP_CLASSPATH + $ENV{'HADOOP_CLASSPATH'} = ""; + + my @cmd; + if (defined($sql)) { + @cmd = ("$cfg->{'hcathome'}/bin/hcat", "-f", $sql); + } else { + @cmd = ("$cfg->{'hcathome'}/bin/hcat"); + } + + my $envStr; + for my $k (keys(%ENV)) { + $envStr .= $k . "=" . $ENV{$k} . " " if ($k =~ /HADOOP/ || $k =~ /HIVE/); + } + $envStr .= " "; + print $log "Going to run hcat command [" . join(" ", @cmd) . + "] with environment set to [$envStr]\n"; + my $runrc = run(\@cmd, \undef, $outfile, $errfile); + my $rc = $? >> 8; + + return $runrc if $runrc; # success + + if (defined($noFailOnFail) && $noFailOnFail) { + return $rc; + } else { + die "Failed running hcat command [" . join(" ", @cmd) . "]\n"; + } +} + +############################################################################## +# Sub: runDbCmd +# +# Run the provided mysql command +# +# Returns: +# Nothing +sub runDbCmd($$$;$) +{ + my ($cfg, $log, $sqlfile, $outfile) = @_; + + $outfile = $log if (!defined($outfile)); + + open(SQL, "< $sqlfile") or die "Unable to open $sqlfile for reading, $!\n"; + + my @cmd = ('mysql', '-u', $cfg->{'dbuser'}, '-D', $cfg->{'dbdb'}, + '-h', $cfg->{'dbhost'}, "--password=$cfg->{'dbpasswd'}", + "--skip-column-names"); + + print $log "Going to run [" . join(" ", @cmd) . "] passing in [$sqlfile]\n"; + + run(\@cmd, \*SQL, $outfile, $log) or + die "Failed running " . join(" ", @cmd) . "\n"; + close(SQL); +} + +# Sub: runHadoopCmd +# +# Run the provided hadoop command +# +# Returns: +# Nothing +sub runHadoopCmd($$$) +{ + my ($cfg, $log, $c) = @_; + + my @cmd = ("$ENV{'HADOOP_HOME'}/bin/hadoop"); + push(@cmd, split(' ', $c)); + + print $log "Going to run [" . join(" ", @cmd) . "]\n"; + + run(\@cmd, \undef, $log, $log) or + die "Failed running " . join(" ", @cmd) . "\n"; +} + +############################################################################## # Sub: localTime # # Returns: @@ -111,35 +394,6 @@ return @result; } -sub getHadoopCmd -{ - my ( $properties ) = @_; - - my $subName = (caller(0))[3]; - my @baseCmd; - - die "$0.$subName: null properties" if (! $properties ); - - my $cmd; - - $cmd = $properties->{'gridstack.root'} . "/hadoop/current/bin/hadoop"; - if ( ! -x "$cmd" ) { - print STDERR "\n$0::$subName WARNING: Can't find hadoop command: $cmd\n"; - $cmd = `which hadoop`; - chomp $cmd; - print STDERR "$0::$subName WARNING: Instead using command: $cmd\n"; - } - if ( ! 
-x "$cmd" ) { - die "\n$0::$subName FATAL: Hadoop command does not exist: $cmd\n"; - } - push (@baseCmd, $cmd); - - push (@baseCmd, '--config', $properties->{'testconfigpath'}) if defined($properties->{'testconfigpath'}); - - return @baseCmd; -} - - sub getHiveCmd { my ( $properties ) = @_; @@ -187,6 +441,17 @@ return @baseCmd; } +sub show_call_stack { + my ( $path, $line, $subr ); + my $max_depth = 30; + my $i = 1; + print("--- Begin stack trace ---"); + while ( (my @call_details = (caller($i++))) && ($i<$max_depth) ) { + print("$call_details[1] line $call_details[2] in function $ ++call_details[3]"); + print("--- End stack trace ---"); + } +} sub getPigCmd @@ -198,6 +463,7 @@ my @baseCmd; die "$0.$subName: null properties" if (! $properties ); +show_call_stack(); #UGLY HACK for pig sql support if ( $jarkey =~ /testsql/ ) { @@ -217,6 +483,7 @@ # This allows for testing of the pig script as installed, and for testin of # the pig script's options, including error testing. +print 'use-pig.pl?????'; $cmd = $properties->{'gridstack.root'} . "/pig/" . $properties->{'pigTestBuildName'} . "/bin/pig"; if ( ! -x "$cmd" ) { @@ -241,6 +508,7 @@ } else { $cmd="java"; +print 'not use-pig.pl?????'; # Set JAVA options # User can provide only one of Index: src/test/e2e/hcatalog/build.xml =================================================================== --- src/test/e2e/hcatalog/build.xml (revision 1208047) +++ src/test/e2e/hcatalog/build.xml (working copy) @@ -40,6 +40,7 @@ + @@ -56,12 +57,38 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -103,6 +130,7 @@ + @@ -123,16 +151,16 @@ - - - + + - + @@ -156,19 +184,29 @@ + + + - + + + + + + + + - + @@ -184,21 +222,22 @@ + + - - + + - @@ -214,6 +253,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +