Description
-- Overview: load two tab-separated inputs, stream the second one through an
-- external command whose declared output schema adds an unstem_qid column,
-- and store the streamed relation under /user/vega/zoom/y_debug.
-- Script parameters used below: FS_TD, QID_IN_FILES, FS_USER, ID_RQ_IN_FILE, USER.

-- load the QID_CT_QP20 data
-- (x is only loaded and described in this script; nothing below consumes it)
x = LOAD '$FS_TD/$QID_IN_FILES' USING PigStorage('\t')
    AS (unstem_qid:chararray, jid_score_pairs:chararray);
DESCRIBE x;
--ILLUSTRATE x;

-- load the ID_RQ data
y0 = LOAD '$FS_USER/$ID_RQ_IN_FILE' USING PigStorage('\t')
    AS (sid:chararray, query:chararray);

-- force parallelization (disabled)
-- y1 = ORDER y0 BY sid PARALLEL $NUM;

-- compute unstem_qid
-- The streaming command reads tab-separated tuples on stdin and writes
-- tab-separated tuples on stdout; the binary and its three data files are
-- shipped to the task nodes via SHIP.
DEFINE f `text_streamer_query j3_unicode.dat prop.dat normal.txt TAB TAB 1:yes:UNSTEM_ID:%llx`
    INPUT(stdin USING PigStorage('\t'))
    OUTPUT(stdout USING PigStorage('\t'))
    SHIP('$USER/text_streamer_query', '$USER/j3_unicode.dat', '$USER/prop.dat', '$USER/normal.txt');
y = STREAM y0 THROUGH f AS (sid:chararray, query:chararray, unstem_qid:chararray);
DESCRIBE y;
--ILLUSTRATE y;

-- remove any previous output so that STORE does not fail on an existing path
rmf /user/vega/zoom/y_debug
STORE y INTO '/user/vega/zoom/y_debug' USING PigStorage('\t');
2009-10-30 13:36:48,437 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: hdfs://dd-9c32d03:8887/,/teoma/dd-9c34d04/middleware/hadoop.test.data/dfs/name
09/10/30 13:36:48 INFO executionengine.HExecutionEngine: Connecting to hadoop file system at: hdfs://dd-9c32d03:8887/,/teoma/dd-9c34d04/middleware/hadoop.test.data/dfs/name
2009-10-30 13:36:48,495 [main] INFO org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to map-reduce job tracker at: dd-9c32d04:8889
09/10/30 13:36:48 INFO executionengine.HExecutionEngine: Connecting to map-reduce job tracker at: dd-9c32d04:8889
2009-10-30 13:36:49,242 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 2999: Unexpected internal error. null
09/10/30 13:36:49 ERROR grunt.Grunt: ERROR 2999: Unexpected internal error. null
Details at logfile: /disk1/vega/zoom/pig_1256909801304.log