Index: data/files/text-en.txt =================================================================== --- data/files/text-en.txt (revision 0) +++ data/files/text-en.txt (revision 0) @@ -0,0 +1,78 @@ +One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked. +"What's happened to me?" he thought. It wasn't a dream. His room, a proper human room although a little too small, lay peacefully between its four familiar walls. +A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame. +It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm towards the viewer. +Gregor then turned to look out the window at the dull weather. +Drops of rain could be heard hitting the pane, which made him feel quite sad. "How about if I sleep a little bit longer and forget all this nonsense", he thought, but that was something he was unable to do because he was used to sleeping on his right, and in his present state couldn't get into that position. However hard he threw himself onto his right, he always rolled back to where he was. +He must have tried it a hundred times, shut his eyes so that he wouldn't have to look at the floundering legs, and only stopped when he began to feel a mild, dull pain there that he had never felt before. +"Oh, God", he thought, "what a strenuous career it is that I've +chosen! Travelling day in and day out. Doing business like this +takes much more effort than doing your own business at home, and on +top of that there's the curse of travelling, worries about making +train connections, bad and irregular food, contact with different +people all the time so that you can never get to know anyone or +become friendly with them. It can all go to Hell!" He felt a +slight itch up on his belly; pushed himself slowly up on his back +towards the headboard so that he could lift his head better; found +where the itch was, and saw that it was covered with lots of little +white spots which he didn't know what to make of; and when he tried +to feel the place with one of his legs he drew it quickly back because as soon as he touched it he was overcome by a cold shudder. He slid back into his former position. "Getting up early all the time", he thought, "it makes you stupid. You've got to get enough sleep. Other travelling salesmen live a life of luxury. For instance, whenever I go back to the guest house during the morning to copy out the contract, these gentlemen are always still sitting there eating their breakfasts. I ought to just try that with my +boss; I'd get kicked out on the spot. But who knows, maybe that +would be the best thing for me. If I didn't have my parents to +think about I'd have given in my notice a long time ago, I'd have +gone up to the boss and told him just what I think, tell him +everything I would, let him know just what I feel. He'd fall right +off his desk! And it's a funny sort of business to be sitting up +there at your desk, talking down at your subordinates from up there, +especially when you have to go right up close because the boss is +hard of hearing. Well, there's still some hope; once I've got the +money together to pay off my parents' debt to him - another five or +six years I suppose - that's definitely what I'll do. That's when +I'll make the big change. First of all though, I've got to get up, +my train leaves at five." +One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections. The bedding was hardly able to cover it and seemed ready to slide off any moment. His many legs, pitifully thin compared with the size of the rest of him, waved about helplessly as he looked. +"What's happened to me?" he thought. It wasn't a dream. His room, a proper human room although a little too small, lay peacefully between its four familiar walls. +A collection of textile samples lay spread out on the table - Samsa was a travelling salesman - and above it there hung a picture that he had recently cut out of an illustrated magazine and housed in a nice, gilded frame. +It showed a lady fitted out with a fur hat and fur boa who sat upright, raising a heavy fur muff that covered the whole of her lower arm towards the viewer. +Gregor then turned to look out the window at the dull weather. +Drops of rain could be heard hitting the pane, which made him feel quite sad. "How about if I sleep a little bit longer and forget all this nonsense", he thought, but that was something he was unable to do because he was used to sleeping on his right, and in his present state couldn't get into that position. However hard he threw himself onto his right, he always rolled back to where he was. +He must have tried it a hundred times, shut his eyes so that he wouldn't have to look at the floundering legs, and only stopped when he began to feel a mild, dull pain there that he had never felt before. +"Oh, God", he thought, "what a strenuous career it is that I've +chosen! Travelling day in and day out. Doing business like this +takes much more effort than doing your own business at home, and on +top of that there's the curse of travelling, worries about making +train connections, bad and irregular food, contact with different +people all the time so that you can never get to know anyone or +become friendly with them. It can all go to Hell!" He felt a +slight itch up on his belly; pushed himself slowly up on his back +towards the headboard so that he could lift his head better; found +where the itch was, and saw that it was covered with lots of little +white spots which he didn't know what to make of; and when he tried +to feel the place with one of his legs he drew it quickly back +because as soon as he touched it he was overcome by a cold shudder. +He slid back into his former position. "Getting up early all the +time", he thought, "it makes you stupid. You've got to get enough +sleep. Other travelling salesmen live a life of luxury. For +instance, whenever I go back to the guest house during the morning +to copy out the contract, these gentlemen are always still sitting +there eating their breakfasts. I ought to just try that with my +boss; I'd get kicked out on the spot. But who knows, maybe that +would be the best thing for me. If I didn't have my parents to +think about I'd have given in my notice a long time ago, I'd have +gone up to the boss and told him just what I think, tell him +everything I would, let him know just what I feel. He'd fall right +off his desk! And it's a funny sort of business to be sitting up +there at your desk, talking down at your subordinates from up there, +especially when you have to go right up close because the boss is +hard of hearing. Well, there's still some hope; once I've got the +money together to pay off my parents' debt to him - another five or +six years I suppose - that's definitely what I'll do. That's when +I'll make the big change. First of all though, I've got to get up, +my train leaves at five." +And he looked over at the alarm clock, ticking on the chest of drawers. "God in Heaven!" he thought. It was half past six and the hands were quietly moving forwards, it was even later than half past, more like quarter to seven. Had the alarm clock not rung? He could see from the bed that it had been set for four o'clock as it should have been; it certainly must have rung. Yes, but was it +possible to quietly sleep through that furniture-rattling noise? True, he had not slept peacefully, but probably all the more deeply because of that. What should he do now? The next train went at seven; if he were to catch that he would have to rush like mad and +the collection of samples was still not packed, and he did not at all feel particularly fresh and lively. And even if he did catch the train he would not avoid his boss's anger as the office assistant would have been there to see the five o'clock train go, he +would have put in his report about Gregor's not being there a long time ago. The office assistant was the boss's man, spineless, and with no understanding. What about if he reported sick? But that would be extremely strained and suspicious as in fifteen years of +service Gregor had never once yet been ill. His boss would certainly come round with the doctor from the medical insurance company, accuse his parents of having a lazy son, and accept the doctor's recommendation not to make any claim as the doctor believed +that no-one was ever ill but that many were workshy. And what's more, would he have been entirely wrong in this case? Gregor did in fact, apart from excessive sleepiness after sleeping for so long, +feel completely well and even felt much hungrier than usual. Index: ql/src/test/results/clientpositive/udaf_ngrams.q.out =================================================================== --- ql/src/test/results/clientpositive/udaf_ngrams.q.out (revision 0) +++ ql/src/test/results/clientpositive/udaf_ngrams.q.out (revision 0) @@ -0,0 +1,60 @@ +PREHOOK: query: CREATE TABLE kafka (contents STRING) +PREHOOK: type: CREATETABLE +POSTHOOK: query: CREATE TABLE kafka (contents STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: default@kafka +PREHOOK: query: LOAD DATA LOCAL INPATH '../data/files/text-en.txt' INTO TABLE kafka +PREHOOK: type: LOAD +POSTHOOK: query: LOAD DATA LOCAL INPATH '../data/files/text-en.txt' INTO TABLE kafka +POSTHOOK: type: LOAD +POSTHOOK: Output: default@kafka +PREHOOK: query: SELECT ngrams(sentences(lower(contents)), 2, 100, 1000) FROM kafka +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-08-55_855_577317115029406892/10000 +POSTHOOK: query: SELECT ngrams(sentences(lower(contents)), 2, 100, 1000) FROM kafka +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-08-55_855_577317115029406892/10000 +[{"ngram":["that","he"],"estfrequency":9.0},{"ngram":["he","thought"],"estfrequency":9.0},{"ngram":["he","was"],"estfrequency":8.0},{"ngram":["on","his"],"estfrequency":8.0},{"ngram":["so","that"],"estfrequency":6.0},{"ngram":["a","little"],"estfrequency":6.0},{"ngram":["in","his"],"estfrequency":5.0},{"ngram":["on","the"],"estfrequency":5.0},{"ngram":["at","the"],"estfrequency":5.0},{"ngram":["he","had"],"estfrequency":5.0},{"ngram":["all","the"],"estfrequency":5.0},{"ngram":["if","he"],"estfrequency":5.0},{"ngram":["he","could"],"estfrequency":5.0},{"ngram":["have","to"],"estfrequency":5.0},{"ngram":["got","to"],"estfrequency":4.0},{"ngram":["if","i"],"estfrequency":4.0},{"ngram":["to","get"],"estfrequency":4.0},{"ngram":["just","what"],"estfrequency":4.0},{"ngram":["my","parents"],"estfrequency":4.0},{"ngram":["to","feel"],"estfrequency":4.0},{"ngram":["up","on"],"estfrequency":4.0},{"ngram":["as","he"],"estfrequency":4.0},{"ngram":["back","to"],"estfrequency":4.0},{"ngram":["to","look"],"estfrequency":4.0},{"ngram":["out","the"],"estfrequency":4.0},{"ngram":["to","the"],"estfrequency":4.0},{"ngram":["at","your"],"estfrequency":4.0},{"ngram":["i'd","have"],"estfrequency":4.0},{"ngram":["his","right"],"estfrequency":4.0},{"ngram":["the","boss"],"estfrequency":4.0},{"ngram":["out","on"],"estfrequency":4.0},{"ngram":["i've","got"],"estfrequency":4.0},{"ngram":["what","i"],"estfrequency":4.0},{"ngram":["it","was"],"estfrequency":4.0},{"ngram":["when","he"],"estfrequency":4.0},{"ngram":["but","that"],"estfrequency":4.0},{"ngram":["his","head"],"estfrequency":4.0},{"ngram":["towards","the"],"estfrequency":4.0},{"ngram":["about","if"],"estfrequency":3.0},{"ngram":["collection","of"],"estfrequency":3.0},{"ngram":["would","have"],"estfrequency":3.0},{"ngram":["had","never"],"estfrequency":3.0},{"ngram":["would","be"],"estfrequency":3.0},{"ngram":["must","have"],"estfrequency":3.0},{"ngram":["the","time"],"estfrequency":3.0},{"ngram":["long","time"],"estfrequency":3.0},{"ngram":["could","see"],"estfrequency":3.0},{"ngram":["with","the"],"estfrequency":3.0},{"ngram":["to","make"],"estfrequency":3.0},{"ngram":["of","that"],"estfrequency":3.0},{"ngram":["a","long"],"estfrequency":3.0},{"ngram":["that","it"],"estfrequency":3.0},{"ngram":["have","been"],"estfrequency":3.0},{"ngram":["he","looked"],"estfrequency":3.0},{"ngram":["time","ago"],"estfrequency":3.0},{"ngram":["up","there"],"estfrequency":2.0},{"ngram":["when","you"],"estfrequency":2.0},{"ngram":["unable","to"],"estfrequency":2.0},{"ngram":["worries","about"],"estfrequency":2.0},{"ngram":["salesmen","live"],"estfrequency":2.0},{"ngram":["brown","belly"],"estfrequency":2.0},{"ngram":["the","headboard"],"estfrequency":2.0},{"ngram":["too","small"],"estfrequency":2.0},{"ngram":["career","it"],"estfrequency":2.0},{"ngram":["into","his"],"estfrequency":2.0},{"ngram":["let","him"],"estfrequency":2.0},{"ngram":["are","always"],"estfrequency":2.0},{"ngram":["friendly","with"],"estfrequency":2.0},{"ngram":["heard","hitting"],"estfrequency":2.0},{"ngram":["him","just"],"estfrequency":2.0},{"ngram":["still","some"],"estfrequency":2.0},{"ngram":["travelling","salesman"],"estfrequency":2.0},{"ngram":["people","all"],"estfrequency":2.0},{"ngram":["hundred","times"],"estfrequency":2.0},{"ngram":["and","it's"],"estfrequency":2.0},{"ngram":["began","to"],"estfrequency":2.0},{"ngram":["boss","i'd"],"estfrequency":2.0},{"ngram":["what","i'll"],"estfrequency":2.0},{"ngram":["close","because"],"estfrequency":2.0},{"ngram":["gone","up"],"estfrequency":2.0},{"ngram":["thought","but"],"estfrequency":2.0},{"ngram":["a","picture"],"estfrequency":2.0},{"ngram":["samsa","woke"],"estfrequency":2.0},{"ngram":["be","heard"],"estfrequency":2.0},{"ngram":["his","eyes"],"estfrequency":2.0},{"ngram":["off","any"],"estfrequency":2.0},{"ngram":["i'll","make"],"estfrequency":2.0},{"ngram":["other","travelling"],"estfrequency":2.0},{"ngram":["lift","his"],"estfrequency":2.0},{"ngram":["belly","pushed"],"estfrequency":2.0},{"ngram":["quickly","back"],"estfrequency":2.0},{"ngram":["lay","on"],"estfrequency":2.0},{"ngram":["him","another"],"estfrequency":2.0},{"ngram":["but","who"],"estfrequency":2.0},{"ngram":["to","copy"],"estfrequency":2.0},{"ngram":["its","four"],"estfrequency":2.0},{"ngram":["about","i'd"],"estfrequency":2.0},{"ngram":["size","of"],"estfrequency":2.0},{"ngram":["get","to"],"estfrequency":2.0},{"ngram":["there's","the"],"estfrequency":2.0}] +PREHOOK: query: SELECT ngrams(sentences(lower(contents)), 1, 100, 1000) FROM kafka +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-00_807_3672038623494864629/10000 +POSTHOOK: query: SELECT ngrams(sentences(lower(contents)), 1, 100, 1000) FROM kafka +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-00_807_3672038623494864629/10000 +[{"ngram":["he"],"estfrequency":67.0},{"ngram":["the"],"estfrequency":67.0},{"ngram":["to"],"estfrequency":54.0},{"ngram":["a"],"estfrequency":44.0},{"ngram":["and"],"estfrequency":43.0},{"ngram":["his"],"estfrequency":36.0},{"ngram":["of"],"estfrequency":35.0},{"ngram":["that"],"estfrequency":33.0},{"ngram":["it"],"estfrequency":28.0},{"ngram":["was"],"estfrequency":24.0},{"ngram":["have"],"estfrequency":18.0},{"ngram":["i"],"estfrequency":16.0},{"ngram":["up"],"estfrequency":16.0},{"ngram":["with"],"estfrequency":16.0},{"ngram":["at"],"estfrequency":15.0},{"ngram":["on"],"estfrequency":15.0},{"ngram":["in"],"estfrequency":15.0},{"ngram":["out"],"estfrequency":14.0},{"ngram":["there"],"estfrequency":12.0},{"ngram":["what"],"estfrequency":12.0},{"ngram":["all"],"estfrequency":12.0},{"ngram":["him"],"estfrequency":12.0},{"ngram":["back"],"estfrequency":12.0},{"ngram":["would"],"estfrequency":11.0},{"ngram":["get"],"estfrequency":10.0},{"ngram":["as"],"estfrequency":10.0},{"ngram":["my"],"estfrequency":10.0},{"ngram":["when"],"estfrequency":10.0},{"ngram":["feel"],"estfrequency":10.0},{"ngram":["about"],"estfrequency":10.0},{"ngram":["thought"],"estfrequency":9.0},{"ngram":["if"],"estfrequency":9.0},{"ngram":["right"],"estfrequency":8.0},{"ngram":["had"],"estfrequency":8.0},{"ngram":["into"],"estfrequency":8.0},{"ngram":["little"],"estfrequency":8.0},{"ngram":["travelling"],"estfrequency":8.0},{"ngram":["but"],"estfrequency":8.0},{"ngram":["time"],"estfrequency":7.0},{"ngram":["could"],"estfrequency":7.0},{"ngram":["be"],"estfrequency":7.0},{"ngram":["from"],"estfrequency":7.0},{"ngram":["train"],"estfrequency":7.0},{"ngram":["go"],"estfrequency":7.0},{"ngram":["so"],"estfrequency":7.0},{"ngram":["boss"],"estfrequency":7.0},{"ngram":["not"],"estfrequency":7.0},{"ngram":["because"],"estfrequency":7.0},{"ngram":["off"],"estfrequency":6.0},{"ngram":["legs"],"estfrequency":6.0},{"ngram":["lay"],"estfrequency":6.0},{"ngram":["i'd"],"estfrequency":6.0},{"ngram":["business"],"estfrequency":6.0},{"ngram":["your"],"estfrequency":6.0},{"ngram":["know"],"estfrequency":6.0},{"ngram":["i've"],"estfrequency":6.0},{"ngram":["fur"],"estfrequency":6.0},{"ngram":["gregor"],"estfrequency":6.0},{"ngram":["got"],"estfrequency":6.0},{"ngram":["just"],"estfrequency":6.0},{"ngram":["you"],"estfrequency":6.0},{"ngram":["for"],"estfrequency":6.0},{"ngram":["himself"],"estfrequency":6.0},{"ngram":["sleep"],"estfrequency":5.0},{"ngram":["do"],"estfrequency":5.0},{"ngram":["still"],"estfrequency":5.0},{"ngram":["five"],"estfrequency":5.0},{"ngram":["never"],"estfrequency":5.0},{"ngram":["this"],"estfrequency":5.0},{"ngram":["more"],"estfrequency":5.0},{"ngram":["felt"],"estfrequency":5.0},{"ngram":["parents"],"estfrequency":5.0},{"ngram":["make"],"estfrequency":5.0},{"ngram":["been"],"estfrequency":5.0},{"ngram":["look"],"estfrequency":4.0},{"ngram":["morning"],"estfrequency":4.0},{"ngram":["doing"],"estfrequency":4.0},{"ngram":["long"],"estfrequency":4.0},{"ngram":["where"],"estfrequency":4.0},{"ngram":["itch"],"estfrequency":4.0},{"ngram":["room"],"estfrequency":4.0},{"ngram":["day"],"estfrequency":4.0},{"ngram":["by"],"estfrequency":4.0},{"ngram":["which"],"estfrequency":4.0},{"ngram":["i'll"],"estfrequency":4.0},{"ngram":["see"],"estfrequency":4.0},{"ngram":["samsa"],"estfrequency":4.0},{"ngram":["tried"],"estfrequency":4.0},{"ngram":["who"],"estfrequency":4.0},{"ngram":["sitting"],"estfrequency":4.0},{"ngram":["that's"],"estfrequency":4.0},{"ngram":["think"],"estfrequency":4.0},{"ngram":["me"],"estfrequency":4.0},{"ngram":["is"],"estfrequency":4.0},{"ngram":["found"],"estfrequency":4.0},{"ngram":["always"],"estfrequency":4.0},{"ngram":["desk"],"estfrequency":4.0},{"ngram":["belly"],"estfrequency":4.0},{"ngram":["dull"],"estfrequency":4.0},{"ngram":["than"],"estfrequency":4.0}] +PREHOOK: query: SELECT ngrams(sentences(lower(contents)), 3, 100, 1000) FROM kafka +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-05_661_5547440857175678543/10000 +POSTHOOK: query: SELECT ngrams(sentences(lower(contents)), 3, 100, 1000) FROM kafka +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-05_661_5547440857175678543/10000 +[{"ngram":["that","he","had"],"estfrequency":4.0},{"ngram":["got","to","get"],"estfrequency":4.0},{"ngram":["so","that","he"],"estfrequency":4.0},{"ngram":["just","what","i"],"estfrequency":4.0},{"ngram":["out","on","the"],"estfrequency":4.0},{"ngram":["up","on","his"],"estfrequency":4.0},{"ngram":["a","long","time"],"estfrequency":3.0},{"ngram":["long","time","ago"],"estfrequency":3.0},{"ngram":["he","could","see"],"estfrequency":3.0},{"ngram":["all","the","time"],"estfrequency":3.0},{"ngram":["dreams","he","found"],"estfrequency":2.0},{"ngram":["with","a","fur"],"estfrequency":2.0},{"ngram":["and","irregular","food"],"estfrequency":2.0},{"ngram":["lots","of","little"],"estfrequency":2.0},{"ngram":["right","and","in"],"estfrequency":2.0},{"ngram":["just","try","that"],"estfrequency":2.0},{"ngram":["time","ago","i'd"],"estfrequency":2.0},{"ngram":["of","rain","could"],"estfrequency":2.0},{"ngram":["shut","his","eyes"],"estfrequency":2.0},{"ngram":["little","he","could"],"estfrequency":2.0},{"ngram":["into","his","former"],"estfrequency":2.0},{"ngram":["showed","a","lady"],"estfrequency":2.0},{"ngram":["effort","than","doing"],"estfrequency":2.0},{"ngram":["stopped","when","he"],"estfrequency":2.0},{"ngram":["look","at","the"],"estfrequency":2.0},{"ngram":["a","nice","gilded"],"estfrequency":2.0},{"ngram":["business","at","home"],"estfrequency":2.0},{"ngram":["out","the","contract"],"estfrequency":2.0},{"ngram":["out","the","window"],"estfrequency":2.0},{"ngram":["day","in","and"],"estfrequency":2.0},{"ngram":["can","all","go"],"estfrequency":2.0},{"ngram":["with","one","of"],"estfrequency":2.0},{"ngram":["hard","he","threw"],"estfrequency":2.0},{"ngram":["his","brown","belly"],"estfrequency":2.0},{"ngram":["he","lifted","his"],"estfrequency":2.0},{"ngram":["many","legs","pitifully"],"estfrequency":2.0},{"ngram":["sat","upright","raising"],"estfrequency":2.0},{"ngram":["itch","up","on"],"estfrequency":2.0},{"ngram":["all","go","to"],"estfrequency":2.0},{"ngram":["i","sleep","a"],"estfrequency":2.0},{"ngram":["of","all","though"],"estfrequency":2.0},{"ngram":["to","be","sitting"],"estfrequency":2.0},{"ngram":["irregular","food","contact"],"estfrequency":2.0},{"ngram":["have","tried","it"],"estfrequency":2.0},{"ngram":["however","hard","he"],"estfrequency":2.0},{"ngram":["are","always","still"],"estfrequency":2.0},{"ngram":["had","recently","cut"],"estfrequency":2.0},{"ngram":["slowly","up","on"],"estfrequency":2.0},{"ngram":["how","about","if"],"estfrequency":2.0},{"ngram":["by","arches","into"],"estfrequency":2.0},{"ngram":["know","what","to"],"estfrequency":2.0},{"ngram":["a","fur","hat"],"estfrequency":2.0},{"ngram":["his","present","state"],"estfrequency":2.0},{"ngram":["career","it","is"],"estfrequency":2.0},{"ngram":["to","get","enough"],"estfrequency":2.0},{"ngram":["gregor","samsa","woke"],"estfrequency":2.0},{"ngram":["upright","raising","a"],"estfrequency":2.0},{"ngram":["god","he","thought"],"estfrequency":2.0},{"ngram":["to","pay","off"],"estfrequency":2.0},{"ngram":["it's","a","funny"],"estfrequency":2.0},{"ngram":["with","lots","of"],"estfrequency":2.0},{"ngram":["contact","with","different"],"estfrequency":2.0},{"ngram":["desk","talking","down"],"estfrequency":2.0},{"ngram":["hundred","times","shut"],"estfrequency":2.0},{"ngram":["close","because","the"],"estfrequency":2.0},{"ngram":["he","was","overcome"],"estfrequency":2.0},{"ngram":["when","he","tried"],"estfrequency":2.0},{"ngram":["what","i'll","do"],"estfrequency":2.0},{"ngram":["about","if","i"],"estfrequency":2.0},{"ngram":["go","to","hell"],"estfrequency":2.0},{"ngram":["cut","out","of"],"estfrequency":2.0},{"ngram":["lift","his","head"],"estfrequency":2.0},{"ngram":["knows","maybe","that"],"estfrequency":2.0},{"ngram":["instance","whenever","i"],"estfrequency":2.0},{"ngram":["all","though","i've"],"estfrequency":2.0},{"ngram":["one","morning","when"],"estfrequency":2.0},{"ngram":["makes","you","stupid"],"estfrequency":2.0},{"ngram":["magazine","and","housed"],"estfrequency":2.0},{"ngram":["house","during","the"],"estfrequency":2.0},{"ngram":["do","because","he"],"estfrequency":2.0},{"ngram":["arm","towards","the"],"estfrequency":2.0},{"ngram":["him","another","five"],"estfrequency":2.0},{"ngram":["on","the","table"],"estfrequency":2.0},{"ngram":["he","began","to"],"estfrequency":2.0},{"ngram":["because","the","boss"],"estfrequency":2.0},{"ngram":["my","parents","to"],"estfrequency":2.0},{"ngram":["hung","a","picture"],"estfrequency":2.0},{"ngram":["the","alarm","clock"],"estfrequency":2.0},{"ngram":["think","about","i'd"],"estfrequency":2.0},{"ngram":["he","wouldn't","have"],"estfrequency":2.0},{"ngram":["lower","arm","towards"],"estfrequency":2.0},{"ngram":["top","of","that"],"estfrequency":2.0},{"ngram":["into","a","horrible"],"estfrequency":2.0},{"ngram":["to","just","try"],"estfrequency":2.0},{"ngram":["turned","to","look"],"estfrequency":2.0},{"ngram":["out","with","a"],"estfrequency":2.0},{"ngram":["especially","when","you"],"estfrequency":2.0},{"ngram":["the","big","change"],"estfrequency":2.0},{"ngram":["he","always","rolled"],"estfrequency":2.0},{"ngram":["down","at","your"],"estfrequency":2.0}] +PREHOOK: query: SELECT ngrams(sentences(lower(contents)), 4, 100, 1000) FROM kafka +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-10_279_7710065688103871347/10000 +POSTHOOK: query: SELECT ngrams(sentences(lower(contents)), 4, 100, 1000) FROM kafka +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-10_279_7710065688103871347/10000 +[{"ngram":["a","long","time","ago"],"estfrequency":3.0},{"ngram":["where","the","itch","was"],"estfrequency":2.0},{"ngram":["pane","which","made","him"],"estfrequency":2.0},{"ngram":["had","never","felt","before"],"estfrequency":2.0},{"ngram":["business","to","be","sitting"],"estfrequency":2.0},{"ngram":["he","thought","but","that"],"estfrequency":2.0},{"ngram":["out","the","window","at"],"estfrequency":2.0},{"ngram":["lower","arm","towards","the"],"estfrequency":2.0},{"ngram":["try","that","with","my"],"estfrequency":2.0},{"ngram":["god","he","thought","what"],"estfrequency":2.0},{"ngram":["hitting","the","pane","which"],"estfrequency":2.0},{"ngram":["my","train","leaves","at"],"estfrequency":2.0},{"ngram":["and","it's","a","funny"],"estfrequency":2.0},{"ngram":["people","all","the","time"],"estfrequency":2.0},{"ngram":["present","state","couldn't","get"],"estfrequency":2.0},{"ngram":["hard","he","threw","himself"],"estfrequency":2.0},{"ngram":["could","lift","his","head"],"estfrequency":2.0},{"ngram":["he","touched","it","he"],"estfrequency":2.0},{"ngram":["your","own","business","at"],"estfrequency":2.0},{"ngram":["takes","much","more","effort"],"estfrequency":2.0},{"ngram":["to","copy","out","the"],"estfrequency":2.0},{"ngram":["on","his","belly","pushed"],"estfrequency":2.0},{"ngram":["look","at","the","floundering"],"estfrequency":2.0},{"ngram":["you've","got","to","get"],"estfrequency":2.0},{"ngram":["floundering","legs","and","only"],"estfrequency":2.0},{"ngram":["well","there's","still","some"],"estfrequency":2.0},{"ngram":["it","he","was","overcome"],"estfrequency":2.0},{"ngram":["his","armour-like","back","and"],"estfrequency":2.0},{"ngram":["was","a","travelling","salesman"],"estfrequency":2.0},{"ngram":["dreams","he","found","himself"],"estfrequency":2.0},{"ngram":["he","found","himself","transformed"],"estfrequency":2.0},{"ngram":["told","him","just","what"],"estfrequency":2.0},{"ngram":["your","subordinates","from","up"],"estfrequency":2.0},{"ngram":["notice","a","long","time"],"estfrequency":2.0},{"ngram":["whenever","i","go","back"],"estfrequency":2.0},{"ngram":["divided","by","arches","into"],"estfrequency":2.0},{"ngram":["drew","it","quickly","back"],"estfrequency":2.0},{"ngram":["magazine","and","housed","in"],"estfrequency":2.0},{"ngram":["parents","debt","to","him"],"estfrequency":2.0},{"ngram":["fur","hat","and","fur"],"estfrequency":2.0},{"ngram":["lift","his","head","better"],"estfrequency":2.0},{"ngram":["to","do","because","he"],"estfrequency":2.0},{"ngram":["one","morning","when","gregor"],"estfrequency":2.0},{"ngram":["then","turned","to","look"],"estfrequency":2.0},{"ngram":["hope","once","i've","got"],"estfrequency":2.0},{"ngram":["touched","it","he","was"],"estfrequency":2.0},{"ngram":["because","the","boss","is"],"estfrequency":2.0},{"ngram":["time","he","thought","it"],"estfrequency":2.0},{"ngram":["himself","transformed","in","his"],"estfrequency":2.0},{"ngram":["a","little","bit","longer"],"estfrequency":2.0},{"ngram":["itch","was","and","saw"],"estfrequency":2.0},{"ngram":["so","that","he","could"],"estfrequency":2.0},{"ngram":["i'd","get","kicked","out"],"estfrequency":2.0},{"ngram":["little","he","could","see"],"estfrequency":2.0},{"ngram":["have","to","look","at"],"estfrequency":2.0},{"ngram":["some","hope","once","i've"],"estfrequency":2.0},{"ngram":["armour-like","back","and","if"],"estfrequency":2.0},{"ngram":["used","to","sleeping","on"],"estfrequency":2.0},{"ngram":["pitifully","thin","compared","with"],"estfrequency":2.0},{"ngram":["i'll","make","the","big"],"estfrequency":2.0},{"ngram":["guest","house","during","the"],"estfrequency":2.0},{"ngram":["little","too","small","lay"],"estfrequency":2.0},{"ngram":["the","contract","these","gentlemen"],"estfrequency":2.0},{"ngram":["get","into","that","position"],"estfrequency":2.0},{"ngram":["out","with","a","fur"],"estfrequency":2.0},{"ngram":["and","when","he","tried"],"estfrequency":2.0},{"ngram":["up","close","because","the"],"estfrequency":2.0},{"ngram":["his","brown","belly","slightly"],"estfrequency":2.0},{"ngram":["sat","upright","raising","a"],"estfrequency":2.0},{"ngram":["on","his","armour-like","back"],"estfrequency":2.0},{"ngram":["about","i'd","have","given"],"estfrequency":2.0},{"ngram":["cover","it","and","seemed"],"estfrequency":2.0},{"ngram":["lay","peacefully","between","its"],"estfrequency":2.0},{"ngram":["him","just","what","i"],"estfrequency":2.0},{"ngram":["to","be","sitting","up"],"estfrequency":2.0},{"ngram":["and","seemed","ready","to"],"estfrequency":2.0},{"ngram":["rain","could","be","heard"],"estfrequency":2.0},{"ngram":["boss","and","told","him"],"estfrequency":2.0},{"ngram":["travelling","worries","about","making"],"estfrequency":2.0},{"ngram":["and","forget","all","this"],"estfrequency":2.0},{"ngram":["time","ago","i'd","have"],"estfrequency":2.0},{"ngram":["given","in","my","notice"],"estfrequency":2.0},{"ngram":["the","itch","was","and"],"estfrequency":2.0},{"ngram":["samsa","woke","from","troubled"],"estfrequency":2.0},{"ngram":["the","time","so","that"],"estfrequency":2.0},{"ngram":["something","he","was","unable"],"estfrequency":2.0},{"ngram":["to","just","try","that"],"estfrequency":2.0},{"ngram":["effort","than","doing","your"],"estfrequency":2.0},{"ngram":["shut","his","eyes","so"],"estfrequency":2.0},{"ngram":["up","on","his","belly"],"estfrequency":2.0},{"ngram":["a","picture","that","he"],"estfrequency":2.0},{"ngram":["was","overcome","by","a"],"estfrequency":2.0},{"ngram":["i've","got","to","get"],"estfrequency":2.0},{"ngram":["a","little","he","could"],"estfrequency":2.0},{"ngram":["years","i","suppose","that's"],"estfrequency":2.0},{"ngram":["he","must","have","tried"],"estfrequency":2.0},{"ngram":["be","heard","hitting","the"],"estfrequency":2.0},{"ngram":["lifted","his","head","a"],"estfrequency":2.0},{"ngram":["and","only","stopped","when"],"estfrequency":2.0},{"ngram":["covered","the","whole","of"],"estfrequency":2.0}] +PREHOOK: query: SELECT ngrams(sentences(lower(contents)), 5, 100, 1000) FROM kafka +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka +PREHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-15_154_7676586083527166994/10000 +POSTHOOK: query: SELECT ngrams(sentences(lower(contents)), 5, 100, 1000) FROM kafka +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka +POSTHOOK: Output: file:/var/folders/7i/7iCDbWRkGHOcgJgX0zscimPXXts/-Tmp-/mlahiri/hive_2010-07-26_15-09-15_154_7676586083527166994/10000 +[{"ngram":["to","cover","it","and","seemed"],"estfrequency":2.0},{"ngram":["train","connections","bad","and","irregular"],"estfrequency":2.0},{"ngram":["know","what","to","make","of"],"estfrequency":2.0},{"ngram":["with","the","size","of","the"],"estfrequency":2.0},{"ngram":["between","its","four","familiar","walls"],"estfrequency":2.0},{"ngram":["white","spots","which","he","didn't"],"estfrequency":2.0},{"ngram":["headboard","so","that","he","could"],"estfrequency":2.0},{"ngram":["a","little","bit","longer","and"],"estfrequency":2.0},{"ngram":["he","was","overcome","by","a"],"estfrequency":2.0},{"ngram":["talking","down","at","your","subordinates"],"estfrequency":2.0},{"ngram":["he","always","rolled","back","to"],"estfrequency":2.0},{"ngram":["a","fur","hat","and","fur"],"estfrequency":2.0},{"ngram":["of","textile","samples","lay","spread"],"estfrequency":2.0},{"ngram":["little","too","small","lay","peacefully"],"estfrequency":2.0},{"ngram":["heard","hitting","the","pane","which"],"estfrequency":2.0},{"ngram":["have","given","in","my","notice"],"estfrequency":2.0},{"ngram":["peacefully","between","its","four","familiar"],"estfrequency":2.0},{"ngram":["it","he","was","overcome","by"],"estfrequency":2.0},{"ngram":["heavy","fur","muff","that","covered"],"estfrequency":2.0},{"ngram":["all","though","i've","got","to"],"estfrequency":2.0},{"ngram":["but","who","knows","maybe","that"],"estfrequency":2.0},{"ngram":["his","brown","belly","slightly","domed"],"estfrequency":2.0},{"ngram":["god","he","thought","what","a"],"estfrequency":2.0},{"ngram":["irregular","food","contact","with","different"],"estfrequency":2.0},{"ngram":["because","he","was","used","to"],"estfrequency":2.0},{"ngram":["his","right","and","in","his"],"estfrequency":2.0},{"ngram":["have","to","look","at","the"],"estfrequency":2.0},{"ngram":["the","table","samsa","was","a"],"estfrequency":2.0},{"ngram":["if","he","lifted","his","head"],"estfrequency":2.0},{"ngram":["however","hard","he","threw","himself"],"estfrequency":2.0},{"ngram":["by","arches","into","stiff","sections"],"estfrequency":2.0},{"ngram":["of","travelling","worries","about","making"],"estfrequency":2.0},{"ngram":["money","together","to","pay","off"],"estfrequency":2.0},{"ngram":["belly","slightly","domed","and","divided"],"estfrequency":2.0},{"ngram":["longer","and","forget","all","this"],"estfrequency":2.0},{"ngram":["covered","the","whole","of","her"],"estfrequency":2.0},{"ngram":["well","there's","still","some","hope"],"estfrequency":2.0},{"ngram":["salesman","and","above","it","there"],"estfrequency":2.0},{"ngram":["i","would","let","him","know"],"estfrequency":2.0},{"ngram":["to","do","because","he","was"],"estfrequency":2.0},{"ngram":["made","him","feel","quite","sad"],"estfrequency":2.0},{"ngram":["forget","all","this","nonsense","he"],"estfrequency":2.0},{"ngram":["i","sleep","a","little","bit"],"estfrequency":2.0},{"ngram":["thought","it","makes","you","stupid"],"estfrequency":2.0},{"ngram":["the","contract","these","gentlemen","are"],"estfrequency":2.0},{"ngram":["from","troubled","dreams","he","found"],"estfrequency":2.0},{"ngram":["sleeping","on","his","right","and"],"estfrequency":2.0},{"ngram":["muff","that","covered","the","whole"],"estfrequency":2.0},{"ngram":["lay","peacefully","between","its","four"],"estfrequency":2.0},{"ngram":["desk","talking","down","at","your"],"estfrequency":2.0},{"ngram":["live","a","life","of","luxury"],"estfrequency":2.0},{"ngram":["in","a","nice","gilded","frame"],"estfrequency":2.0},{"ngram":["these","gentlemen","are","always","still"],"estfrequency":2.0},{"ngram":["getting","up","early","all","the"],"estfrequency":2.0},{"ngram":["a","strenuous","career","it","is"],"estfrequency":2.0},{"ngram":["housed","in","a","nice","gilded"],"estfrequency":2.0},{"ngram":["and","irregular","food","contact","with"],"estfrequency":2.0},{"ngram":["that","covered","the","whole","of"],"estfrequency":2.0},{"ngram":["there's","still","some","hope","once"],"estfrequency":2.0},{"ngram":["brown","belly","slightly","domed","and"],"estfrequency":2.0},{"ngram":["effort","than","doing","your","own"],"estfrequency":2.0},{"ngram":["his","belly","pushed","himself","slowly"],"estfrequency":2.0},{"ngram":["overcome","by","a","cold","shudder"],"estfrequency":2.0},{"ngram":["the","boss","and","told","him"],"estfrequency":2.0},{"ngram":["that","he","had","recently","cut"],"estfrequency":2.0},{"ngram":["he","thought","what","a","strenuous"],"estfrequency":2.0},{"ngram":["your","desk","talking","down","at"],"estfrequency":2.0},{"ngram":["so","that","he","could","lift"],"estfrequency":2.0},{"ngram":["i'd","get","kicked","out","on"],"estfrequency":2.0},{"ngram":["curse","of","travelling","worries","about"],"estfrequency":2.0},{"ngram":["bedding","was","hardly","able","to"],"estfrequency":2.0},{"ngram":["above","it","there","hung","a"],"estfrequency":2.0},{"ngram":["he","thought","it","makes","you"],"estfrequency":2.0},{"ngram":["of","all","though","i've","got"],"estfrequency":2.0},{"ngram":["all","this","nonsense","he","thought"],"estfrequency":2.0},{"ngram":["and","housed","in","a","nice"],"estfrequency":2.0},{"ngram":["magazine","and","housed","in","a"],"estfrequency":2.0},{"ngram":["human","room","although","a","little"],"estfrequency":2.0},{"ngram":["to","go","right","up","close"],"estfrequency":2.0},{"ngram":["more","effort","than","doing","your"],"estfrequency":2.0},{"ngram":["that","he","had","never","felt"],"estfrequency":2.0},{"ngram":["he","wouldn't","have","to","look"],"estfrequency":2.0},{"ngram":["the","size","of","the","rest"],"estfrequency":2.0},{"ngram":["have","tried","it","a","hundred"],"estfrequency":2.0},{"ngram":["he","had","never","felt","before"],"estfrequency":2.0},{"ngram":["eyes","so","that","he","wouldn't"],"estfrequency":2.0},{"ngram":["morning","when","gregor","samsa","woke"],"estfrequency":2.0},{"ngram":["little","bit","longer","and","forget"],"estfrequency":2.0},{"ngram":["room","a","proper","human","room"],"estfrequency":2.0},{"ngram":["back","to","the","guest","house"],"estfrequency":2.0},{"ngram":["i","didn't","have","my","parents"],"estfrequency":2.0},{"ngram":["could","lift","his","head","better"],"estfrequency":2.0},{"ngram":["i","go","back","to","the"],"estfrequency":2.0},{"ngram":["can","all","go","to","hell"],"estfrequency":2.0},{"ngram":["which","he","didn't","know","what"],"estfrequency":2.0},{"ngram":["sat","upright","raising","a","heavy"],"estfrequency":2.0},{"ngram":["because","as","soon","as","he"],"estfrequency":2.0},{"ngram":["back","and","if","he","lifted"],"estfrequency":2.0},{"ngram":["and","forget","all","this","nonsense"],"estfrequency":2.0},{"ngram":["what","i","think","tell","him"],"estfrequency":2.0}] +PREHOOK: query: DROP TABLE kafka +PREHOOK: type: DROPTABLE +POSTHOOK: query: DROP TABLE kafka +POSTHOOK: type: DROPTABLE +POSTHOOK: Output: default@kafka Index: ql/src/test/results/clientpositive/show_functions.q.out =================================================================== --- ql/src/test/results/clientpositive/show_functions.q.out (revision 979465) +++ ql/src/test/results/clientpositive/show_functions.q.out (working copy) @@ -85,6 +85,7 @@ minute month negative +ngrams not or parse_url Index: ql/src/test/queries/clientpositive/udaf_ngrams.q =================================================================== --- ql/src/test/queries/clientpositive/udaf_ngrams.q (revision 0) +++ ql/src/test/queries/clientpositive/udaf_ngrams.q (revision 0) @@ -0,0 +1,10 @@ +CREATE TABLE kafka (contents STRING); +LOAD DATA LOCAL INPATH '../data/files/text-en.txt' INTO TABLE kafka; + +SELECT ngrams(sentences(lower(contents)), 2, 100, 1000) FROM kafka; +SELECT ngrams(sentences(lower(contents)), 1, 100, 1000) FROM kafka; +SELECT ngrams(sentences(lower(contents)), 3, 100, 1000) FROM kafka; +SELECT ngrams(sentences(lower(contents)), 4, 100, 1000) FROM kafka; +SELECT ngrams(sentences(lower(contents)), 5, 100, 1000) FROM kafka; + +DROP TABLE kafka; Index: ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (revision 979465) +++ ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java (working copy) @@ -130,6 +130,7 @@ import org.apache.hadoop.hive.ql.udf.UDFUpper; import org.apache.hadoop.hive.ql.udf.UDFWeekOfYear; import org.apache.hadoop.hive.ql.udf.UDFYear; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFnGrams; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFAverage; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; @@ -367,8 +368,11 @@ registerGenericUDAF("histogram_numeric", new GenericUDAFHistogramNumeric()); registerGenericUDAF("percentile_approx", new GenericUDAFPercentileApprox()); + registerGenericUDAF("ngrams", new GenericUDAFnGrams()); + registerUDAF("percentile", UDAFPercentile.class); + // Generic UDFs registerGenericUDF("array", GenericUDFArray.class); registerGenericUDF("map", GenericUDFMap.class); Index: ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFnGrams.java =================================================================== --- ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFnGrams.java (revision 0) +++ ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFnGrams.java (revision 0) @@ -0,0 +1,462 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Set; +import java.util.Map; +import java.util.Collections; +import java.util.Comparator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.io.Text; + +/** + * Estimates the top-k n-grams in arbitrary sequential data using a heuristic. + */ +@Description(name = "ngrams", + value = "_FUNC_(expr, n, k, pf) - Estimates the top-k n-grams in rows that consist of " + + "sequences of strings, represented as arrays of strings, or arrays of arrays of " + + "strings. 'pf' is an optional precision factor that controls memory usage.", + extended = "The parameter 'n' specifies what type of n-grams are being estimated. Unigrams " + + "are n = 1, and bigrams are n = 2. Generally, n will not be greater than about 5. " + + "The 'k' parameter specifies how many of the highest-frequency n-grams will be " + + "returned by the UDAF. The optional precision factor 'pf' specifies how much " + + "memory to use for estimation; more memory will give more accurate frequency " + + "counts, but could crash the JVM. The default value is 20, which internally " + + "maintains 20*k n-grams, but only returns the k highest frequency ones. " + + "The output is an array of structs with the top-k n-grams. It might be convenient " + + "to explode() the output of this UDAF.") +public class GenericUDAFnGrams implements GenericUDAFResolver { + static final Log LOG = LogFactory.getLog(GenericUDAFnGrams.class.getName()); + + @Override + public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { + if (parameters.length != 3 && parameters.length != 4) { + throw new UDFArgumentTypeException(parameters.length-1, + "Please specify either three or four arguments."); + } + + // Validate the first parameter, which is the expression to compute over. This should be an + // array of strings type, or an array of arrays of strings. + PrimitiveTypeInfo pti; + if (parameters[0].getCategory() != ObjectInspector.Category.LIST) { + throw new UDFArgumentTypeException(0, + "Only list type arguments are accepted but " + + parameters[0].getTypeName() + " was passed as parameter 1."); + } + switch (((ListTypeInfo) parameters[0]).getListElementTypeInfo().getCategory()) { + case PRIMITIVE: + // Parameter 1 was an array of primitives, so make sure the primitives are strings. + pti = (PrimitiveTypeInfo) ((ListTypeInfo) parameters[0]).getListElementTypeInfo(); + break; + + case LIST: + // Parameter 1 was an array of arrays, so make sure that the inner arrays contain + // primitive strings. + ListTypeInfo lti = (ListTypeInfo) + ((ListTypeInfo) parameters[0]).getListElementTypeInfo(); + pti = (PrimitiveTypeInfo) lti.getListElementTypeInfo(); + break; + + default: + throw new UDFArgumentTypeException(0, + "Only arrays of strings or arrays of arrays of strings are accepted but " + + parameters[0].getTypeName() + " was passed as parameter 1."); + } + if(pti.getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) { + throw new UDFArgumentTypeException(0, + "Only array or array> is allowed, but " + + parameters[0].getTypeName() + " was passed as parameter 1."); + } + + // Validate the second parameter, which should be an integer + if(parameters[1].getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new UDFArgumentTypeException(1, "Only integers are accepted but " + + parameters[1].getTypeName() + " was passed as parameter 2."); + } + switch(((PrimitiveTypeInfo) parameters[1]).getPrimitiveCategory()) { + case BYTE: + case SHORT: + case INT: + case LONG: + break; + + default: + throw new UDFArgumentTypeException(1, "Only integers are accepted but " + + parameters[1].getTypeName() + " was passed as parameter 2."); + } + + // Validate the third parameter, which should also be an integer + if(parameters[2].getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new UDFArgumentTypeException(2, "Only integers are accepted but " + + parameters[2].getTypeName() + " was passed as parameter 3."); + } + switch(((PrimitiveTypeInfo) parameters[2]).getPrimitiveCategory()) { + case BYTE: + case SHORT: + case INT: + case LONG: + break; + + default: + throw new UDFArgumentTypeException(2, "Only integers are accepted but " + + parameters[2].getTypeName() + " was passed as parameter 3."); + } + + // If we have the optional fourth parameter, make sure it's also an integer + if(parameters.length == 4) { + if(parameters[3].getCategory() != ObjectInspector.Category.PRIMITIVE) { + throw new UDFArgumentTypeException(3, "Only integers are accepted but " + + parameters[3].getTypeName() + " was passed as parameter 4."); + } + switch(((PrimitiveTypeInfo) parameters[3]).getPrimitiveCategory()) { + case BYTE: + case SHORT: + case INT: + case LONG: + break; + + default: + throw new UDFArgumentTypeException(3, "Only integers are accepted but " + + parameters[3].getTypeName() + " was passed as parameter 4."); + } + } + + return new GenericUDAFnGramEvaluator(); + } + + /** + * A constant-space heuristic to estimate the top-k n-grams. + */ + public static class GenericUDAFnGramEvaluator extends GenericUDAFEvaluator { + // For PARTIAL1 and COMPLETE: ObjectInspectors for original data + private StandardListObjectInspector outerInputOI; + private StandardListObjectInspector innerInputOI; + private PrimitiveObjectInspector inputOI; + private PrimitiveObjectInspector nOI; + private PrimitiveObjectInspector kOI; + private PrimitiveObjectInspector pOI; + + // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations + private StandardListObjectInspector loi; + + @Override + public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { + super.init(m, parameters); + + // Init input object inspectors + if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) { + outerInputOI = (StandardListObjectInspector) parameters[0]; + if(outerInputOI.getListElementObjectInspector().getCategory() == + ObjectInspector.Category.LIST) { + // We're dealing with input that is an array of arrays of strings + innerInputOI = (StandardListObjectInspector) outerInputOI.getListElementObjectInspector(); + inputOI = (PrimitiveObjectInspector) innerInputOI.getListElementObjectInspector(); + } else { + // We're dealing with input that is an array of strings + inputOI = (PrimitiveObjectInspector) outerInputOI.getListElementObjectInspector(); + innerInputOI = null; + } + nOI = (PrimitiveObjectInspector) parameters[1]; + kOI = (PrimitiveObjectInspector) parameters[2]; + if(parameters.length == 4) { + pOI = (PrimitiveObjectInspector) parameters[3]; + } else { + pOI = null; + } + } else { + // Init the list object inspector for handling partial aggregations + loi = (StandardListObjectInspector) parameters[0]; + } + + // Init output object inspectors. + // + // The return type for a partial aggregation is still a list of strings. + // + // The return type for FINAL and COMPLETE is a full aggregation result, which is + // an array of structures containing the n-gram and its estimated frequency. + if (m == Mode.PARTIAL1 || m == Mode.PARTIAL2) { + return ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } else { + // Final return type that goes back to Hive: a list of structs with n-grams and their + // estimated frequencies. + ArrayList foi = new ArrayList(); + foi.add(ObjectInspectorFactory.getStandardListObjectInspector( + PrimitiveObjectInspectorFactory.writableStringObjectInspector)); + foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); + ArrayList fname = new ArrayList(); + fname.add("ngram"); + fname.add("estfrequency"); + return ObjectInspectorFactory.getStandardListObjectInspector( + ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi) ); + } + } + + @Override + public void merge(AggregationBuffer agg, Object partial) throws HiveException { + if(partial == null) { + return; + } + NGramAggBuf myagg = (NGramAggBuf) agg; + + ArrayList partialNGrams = (ArrayList) loi.getList(partial); + int k = Integer.parseInt(((Text)partialNGrams.get(0)).toString()); + int n = Integer.parseInt(((Text)partialNGrams.get(1)).toString()); + int pf = Integer.parseInt(((Text)partialNGrams.get(2)).toString()); + if(myagg.k > 0 && myagg.k != k) { + throw new HiveException(getClass().getSimpleName() + ": mismatch in value for 'k'" + + ", which usually is caused by a non-constant expression. Found '"+k+"' and '" + + myagg.k + "'."); + } + if(myagg.n > 0 && myagg.n != n) { + throw new HiveException(getClass().getSimpleName() + ": mismatch in value for 'n'" + + ", which usually is caused by a non-constant expression. Found '"+n+"' and '" + + myagg.n + "'."); + } + if(myagg.pf > 0 && myagg.pf != pf) { + throw new HiveException(getClass().getSimpleName() + ": mismatch in value for 'pf'" + + ", which usually is caused by a non-constant expression. Found '"+pf+"' and '" + + myagg.pf + "'."); + } + myagg.k = k; + myagg.n = n; + myagg.pf = pf; + + for(int i = 3; i < partialNGrams.size(); i++) { + ArrayList key = new ArrayList(); + for(int j = 0; j < n; j++) { + key.add(((Text)partialNGrams.get(i+j)).toString()); + } + i += n; + double val = Double.parseDouble( ((Text)partialNGrams.get(i)).toString() ); + Double myval = (Double)myagg.ngrams.get(key); + if(myval == null) { + myval = new Double(val); + } else { + myval += val; + } + myagg.ngrams.put(key, myval); + } + trim(myagg, myagg.k*myagg.pf); + } + + @Override + public Object terminatePartial(AggregationBuffer agg) throws HiveException { + NGramAggBuf myagg = (NGramAggBuf) agg; + + ArrayList result = new ArrayList(); + result.add(new Text(Integer.toString(myagg.k))); + result.add(new Text(Integer.toString(myagg.n))); + result.add(new Text(Integer.toString(myagg.pf))); + for(Iterator > it = myagg.ngrams.keySet().iterator(); it.hasNext(); ) { + ArrayList mykey = it.next(); + for(int i = 0; i < mykey.size(); i++) { + result.add(new Text(mykey.get(i))); + } + Double myval = (Double) myagg.ngrams.get(mykey); + result.add(new Text(myval.toString())); + } + + return result; + } + + private void trim(NGramAggBuf agg, int N) { + ArrayList list = new ArrayList(agg.ngrams.entrySet()); + if(list.size() <= N) { + return; + } + Collections.sort(list, new Comparator() { + public int compare(Object o1, Object o2) { + return ((Double)((Map.Entry)o1).getValue()).compareTo( + ((Double)((Map.Entry)o2).getValue()) ); + } + }); + for(int i = 0; i < list.size() - N; i++) { + agg.ngrams.remove( ((Map.Entry)list.get(i)).getKey() ); + } + } + + private void processNgrams(NGramAggBuf agg, ArrayList seq) { + for(int i = seq.size()-agg.n; i >= 0; i--) { + ArrayList ngram = new ArrayList(); + for(int j = 0; j < agg.n; j++) { + ngram.add(seq.get(i+j)); + } + Double curVal = (Double) agg.ngrams.get(ngram); + if(curVal == null) { + // new n-gram + curVal = new Double(1); + } else { + // existing n-gram, just increment count + curVal++; + } + agg.ngrams.put(ngram, curVal); + } + + // do we have too many ngrams? + if(agg.ngrams.size() > agg.k * agg.pf) { + // delete low-support n-grams + trim(agg, agg.k * agg.pf); + } + } + + @Override + public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { + assert (parameters.length == 3); + if(parameters[0] == null || parameters[1] == null || parameters[2] == null) { + return; + } + NGramAggBuf myagg = (NGramAggBuf) agg; + + // Parse out 'n' and 'k' if we haven't already done so, and while we're at it, + // also parse out the precision factor 'pf' if the user has supplied one. + if(myagg.n == 0 || myagg.k == 0) { + myagg.n = PrimitiveObjectInspectorUtils.getInt(parameters[1], nOI); + myagg.k = PrimitiveObjectInspectorUtils.getInt(parameters[2], kOI); + if(myagg.n < 1) { + throw new HiveException(getClass().getSimpleName() + " needs 'n' to be at least 1, " + + "but you supplied " + myagg.n); + } + if(myagg.k < 1) { + throw new HiveException(getClass().getSimpleName() + " needs 'k' to be at least 1, " + + "but you supplied " + myagg.k); + } + if(parameters.length == 4) { + myagg.pf = PrimitiveObjectInspectorUtils.getInt(parameters[3], pOI); + if(myagg.pf < 1) { + throw new HiveException(getClass().getSimpleName() + " needs 'pf' to be at least 1, " + + "but you supplied " + myagg.pf); + } + } + + // Enforce a minimum n-gram buffer size + if(myagg.pf*myagg.k < 1000) { + myagg.pf = 1000 / myagg.k; + } + } + + // get the input expression + ArrayList outer = (ArrayList) outerInputOI.getList(parameters[0]); + if(innerInputOI != null) { + // we're dealing with an array of arrays of strings + for(int i = 0; i < outer.size(); i++) { + ArrayList inner = (ArrayList) innerInputOI.getList(outer.get(i)); + ArrayList words = new ArrayList(); + for(int j = 0; j < inner.size(); j++) { + String word = PrimitiveObjectInspectorUtils.getString(inner.get(j), inputOI); + words.add(word); + } + + // parse out n-grams, update frequency counts + processNgrams(myagg, words); + } + } else { + // we're dealing with an array of strings + ArrayList words = new ArrayList(); + for(int i = 0; i < outer.size(); i++) { + String word = PrimitiveObjectInspectorUtils.getString(outer.get(i), inputOI); + words.add(word); + } + + // parse out n-grams, update frequency counts + processNgrams(myagg, words); + } + } + + @Override + public Object terminate(AggregationBuffer agg) throws HiveException { + NGramAggBuf myagg = (NGramAggBuf) agg; + if (myagg.ngrams.size() < 1) { // SQL standard - return null for zero elements + return null; + } + + ArrayList result = new ArrayList(); + + ArrayList list = new ArrayList(myagg.ngrams.entrySet()); + Collections.sort(list, new Comparator() { + public int compare(Object o1, Object o2) { + return ((Double)((Map.Entry)o2).getValue()).compareTo( + ((Double)((Map.Entry)o1).getValue()) ); + } + }); + + for(int i = 0; i < list.size() && i < myagg.k; i++) { + ArrayList key = (ArrayList)((Map.Entry)list.get(i)).getKey(); + Double val = (Double)((Map.Entry)list.get(i)).getValue(); + + Object[] ngram = new Object[2]; + ngram[0] = new ArrayList(); + for(int j = 0; j < key.size(); j++) { + ((ArrayList)ngram[0]).add(new Text(key.get(j))); + } + ngram[1] = new DoubleWritable(val.doubleValue()); + result.add(ngram); + } + + return result; + } + + + // Aggregation buffer methods. + static class NGramAggBuf implements AggregationBuffer { + HashMap ngrams; + int n; + int k; + int pf; + }; + + @Override + public AggregationBuffer getNewAggregationBuffer() throws HiveException { + NGramAggBuf result = new NGramAggBuf(); + reset(result); + return result; + } + + @Override + public void reset(AggregationBuffer agg) throws HiveException { + NGramAggBuf result = (NGramAggBuf) agg; + result.ngrams = new HashMap(); + result.n = result.k = result.pf = 0; + } + } +}