diff --git itests/qtest-druid/pom.xml itests/qtest-druid/pom.xml index 79a0fb3f2f..e566fcf4d7 100644 --- itests/qtest-druid/pom.xml +++ itests/qtest-druid/pom.xml @@ -43,7 +43,7 @@ 10.11.1.1 16.0.1 4.1.0 - 0.10.2.0 + 1.0.1 diff --git itests/qtest-druid/src/main/java/org/apache/hive/kafka/SingleNodeKafkaCluster.java itests/qtest-druid/src/main/java/org/apache/hive/kafka/SingleNodeKafkaCluster.java index d839fd2db4..c9339b565e 100644 --- itests/qtest-druid/src/main/java/org/apache/hive/kafka/SingleNodeKafkaCluster.java +++ itests/qtest-druid/src/main/java/org/apache/hive/kafka/SingleNodeKafkaCluster.java @@ -10,6 +10,7 @@ import org.apache.hadoop.service.AbstractService; import org.apache.kafka.clients.producer.KafkaProducer; import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.ByteArraySerializer; import org.apache.kafka.common.serialization.StringSerializer; import com.google.common.base.Throwables; @@ -25,6 +26,7 @@ import java.nio.charset.Charset; import java.util.List; import java.util.Properties; +import java.util.stream.IntStream; /** * This class has the hooks to start and stop single node kafka cluster. @@ -32,6 +34,10 @@ */ public class SingleNodeKafkaCluster extends AbstractService { private static final Logger log = LoggerFactory.getLogger(SingleNodeKafkaCluster.class); + private static final int BROKER_PORT = 9092; + private static final String LOCALHOST = "localhost"; + private static final String LOCALHOST_9092 = String.format("%s:%s", LOCALHOST, BROKER_PORT); + private final KafkaServerStartable serverStartable; private final String zkString; @@ -42,9 +48,12 @@ public SingleNodeKafkaCluster(String name, String logDir, Integer zkPort){ this.zkString = String.format("localhost:%d", zkPort); properties.setProperty("zookeeper.connect", zkString); properties.setProperty("broker.id", String.valueOf(1)); - properties.setProperty("host.name", "localhost"); - properties.setProperty("port", Integer.toString(9092)); + properties.setProperty("host.name", LOCALHOST); + properties.setProperty("port", Integer.toString(BROKER_PORT)); properties.setProperty("log.dir", logDir); + // This property is very important: we are sending records with explicit timestamps, + // thus we need to make sure that they don't get deleted by retention + properties.setProperty("log.retention.hours", String.valueOf(Integer.MAX_VALUE)); properties.setProperty("log.flush.interval.messages", String.valueOf(1)); properties.setProperty("offsets.topic.replication.factor", String.valueOf(1)); properties.setProperty("offsets.topic.num.partitions", String.valueOf(1)); @@ -80,7 +89,7 @@ public void createTopicWithData(String topicName, File datafile){ createTopic(topicName); // set up kafka producer Properties properties = new Properties(); - properties.put("bootstrap.servers", "localhost:9092"); + properties.put("bootstrap.servers", LOCALHOST_9092); properties.put("acks", "1"); properties.put("retries", "3"); @@ -91,12 +100,36 @@ public void createTopicWithData(String topicName, File datafile){ )){ List<String> events = Files.readLines(datafile, Charset.forName("UTF-8")); for(String event : events){ - producer.send(new ProducerRecord(topicName, event)); + producer.send(new ProducerRecord<>(topicName, event)); } } catch (IOException e) { Throwables.propagate(e); } + } + + public void createTopicWithData(String topic, List<byte[]> events) { + createTopic(topic); + // set up kafka producer + Properties properties = new Properties(); + properties.put("bootstrap.servers", LOCALHOST_9092); +
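// NOTE: the records below carry fixed, explicit producer timestamps (see the ProducerRecord construction), + // which relies on the effectively infinite log.retention.hours configured for the broker above; + // QTestUtil seeds the wiki_kafka_avro_table topic through this method via createTopicWithData("wiki_kafka_avro_table", getAvroRows()). +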
properties.put("acks", "1"); + properties.put("retries", "3"); + try(KafkaProducer producer = new KafkaProducer<>( + properties, + new ByteArraySerializer(), + new ByteArraySerializer() + )){ + // 1534736225090 -> 08/19/2018 20:37:05 + IntStream.range(0, events.size()) + .mapToObj(i -> new ProducerRecord<>(topic, + 0, + // 1534736225090 -> Mon Aug 20 2018 03:37:05 + 1534736225090L + 1000 * 3600 * i, + ("key-" + i).getBytes(), + events.get(i))) + .forEach(r -> producer.send(r)); + } } public void createTopic(String topic) { diff --git itests/qtest-druid/src/main/java/org/apache/hive/kafka/Wikipedia.java itests/qtest-druid/src/main/java/org/apache/hive/kafka/Wikipedia.java new file mode 100644 index 0000000000..2fb180b33a --- /dev/null +++ itests/qtest-druid/src/main/java/org/apache/hive/kafka/Wikipedia.java @@ -0,0 +1,1418 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.hive.kafka; + +import org.apache.avro.specific.SpecificData; +import org.apache.avro.message.BinaryMessageEncoder; +import org.apache.avro.message.BinaryMessageDecoder; +import org.apache.avro.message.SchemaStore; + +@SuppressWarnings("all") +@org.apache.avro.specific.AvroGenerated +public class Wikipedia extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + private static final long serialVersionUID = 960374719287820723L; + public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Wikipedia\",\"namespace\":\"org.apache.hive.kafka\",\"fields\":[{\"name\":\"isrobot\",\"type\":\"boolean\"},{\"name\":\"channel\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"timestamp\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"flags\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"isunpatrolled\",\"type\":\"boolean\"},{\"name\":\"page\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"diffurl\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"added\",\"type\":\"long\"},{\"name\":\"comment\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"commentlength\",\"type\":\"long\"},{\"name\":\"isnew\",\"type\":\"boolean\"},{\"name\":\"isminor\",\"type\":\"boolean\"},{\"name\":\"delta\",\"type\":\"long\"},{\"name\":\"isanonymous\",\"type\":\"boolean\"},{\"name\":\"user\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}},{\"name\":\"deltabucket\",\"type\":\"double\"},{\"name\":\"deleted\",\"type\":\"long\"},{\"name\":\"namespace\",\"type\":{\"type\":\"string\",\"avro.java.string\":\"String\"}}],\"version\":\"1\"}"); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + + private static SpecificData MODEL$ = new SpecificData(); + + private static final BinaryMessageEncoder ENCODER = + new BinaryMessageEncoder(MODEL$, SCHEMA$); + + private static final BinaryMessageDecoder DECODER = + new BinaryMessageDecoder(MODEL$, SCHEMA$); + + /** + * Return the BinaryMessageDecoder instance used by this class. + */ + public static BinaryMessageDecoder getDecoder() { + return DECODER; + } + + /** + * Create a new BinaryMessageDecoder instance for this class that uses the specified {@link SchemaStore}. 
+ * @param resolver a {@link SchemaStore} used to find schemas by fingerprint + */ + public static BinaryMessageDecoder createDecoder(SchemaStore resolver) { + return new BinaryMessageDecoder(MODEL$, SCHEMA$, resolver); + } + + /** Serializes this Wikipedia to a ByteBuffer. */ + public java.nio.ByteBuffer toByteBuffer() throws java.io.IOException { + return ENCODER.encode(this); + } + + /** Deserializes a Wikipedia from a ByteBuffer. */ + public static Wikipedia fromByteBuffer( + java.nio.ByteBuffer b) throws java.io.IOException { + return DECODER.decode(b); + } + + @Deprecated public boolean isrobot; + @Deprecated public java.lang.String channel; + @Deprecated public java.lang.String timestamp; + @Deprecated public java.lang.String flags; + @Deprecated public boolean isunpatrolled; + @Deprecated public java.lang.String page; + @Deprecated public java.lang.String diffurl; + @Deprecated public long added; + @Deprecated public java.lang.String comment; + @Deprecated public long commentlength; + @Deprecated public boolean isnew; + @Deprecated public boolean isminor; + @Deprecated public long delta; + @Deprecated public boolean isanonymous; + @Deprecated public java.lang.String user; + @Deprecated public double deltabucket; + @Deprecated public long deleted; + @Deprecated public java.lang.String namespace; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use newBuilder(). + */ + public Wikipedia() {} + + /** + * All-args constructor. + * @param isrobot The new value for isrobot + * @param channel The new value for channel + * @param timestamp The new value for timestamp + * @param flags The new value for flags + * @param isunpatrolled The new value for isunpatrolled + * @param page The new value for page + * @param diffurl The new value for diffurl + * @param added The new value for added + * @param comment The new value for comment + * @param commentlength The new value for commentlength + * @param isnew The new value for isnew + * @param isminor The new value for isminor + * @param delta The new value for delta + * @param isanonymous The new value for isanonymous + * @param user The new value for user + * @param deltabucket The new value for deltabucket + * @param deleted The new value for deleted + * @param namespace The new value for namespace + */ + public Wikipedia(java.lang.Boolean isrobot, java.lang.String channel, java.lang.String timestamp, java.lang.String flags, java.lang.Boolean isunpatrolled, java.lang.String page, java.lang.String diffurl, java.lang.Long added, java.lang.String comment, java.lang.Long commentlength, java.lang.Boolean isnew, java.lang.Boolean isminor, java.lang.Long delta, java.lang.Boolean isanonymous, java.lang.String user, java.lang.Double deltabucket, java.lang.Long deleted, java.lang.String namespace) { + this.isrobot = isrobot; + this.channel = channel; + this.timestamp = timestamp; + this.flags = flags; + this.isunpatrolled = isunpatrolled; + this.page = page; + this.diffurl = diffurl; + this.added = added; + this.comment = comment; + this.commentlength = commentlength; + this.isnew = isnew; + this.isminor = isminor; + this.delta = delta; + this.isanonymous = isanonymous; + this.user = user; + this.deltabucket = deltabucket; + this.deleted = deleted; + this.namespace = namespace; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. 
+ public java.lang.Object get(int field$) { + switch (field$) { + case 0: return isrobot; + case 1: return channel; + case 2: return timestamp; + case 3: return flags; + case 4: return isunpatrolled; + case 5: return page; + case 6: return diffurl; + case 7: return added; + case 8: return comment; + case 9: return commentlength; + case 10: return isnew; + case 11: return isminor; + case 12: return delta; + case 13: return isanonymous; + case 14: return user; + case 15: return deltabucket; + case 16: return deleted; + case 17: return namespace; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value="unchecked") + public void put(int field$, java.lang.Object value$) { + switch (field$) { + case 0: isrobot = (java.lang.Boolean)value$; break; + case 1: channel = (java.lang.String)value$.toString(); break; + case 2: timestamp = (java.lang.String)value$.toString(); break; + case 3: flags = (java.lang.String)value$.toString(); break; + case 4: isunpatrolled = (java.lang.Boolean)value$; break; + case 5: page = (java.lang.String)value$.toString(); break; + case 6: diffurl = (java.lang.String)value$.toString(); break; + case 7: added = (java.lang.Long)value$; break; + case 8: comment = (java.lang.String)value$.toString(); break; + case 9: commentlength = (java.lang.Long)value$; break; + case 10: isnew = (java.lang.Boolean)value$; break; + case 11: isminor = (java.lang.Boolean)value$; break; + case 12: delta = (java.lang.Long)value$; break; + case 13: isanonymous = (java.lang.Boolean)value$; break; + case 14: user = (java.lang.String)value$.toString(); break; + case 15: deltabucket = (java.lang.Double)value$; break; + case 16: deleted = (java.lang.Long)value$; break; + case 17: namespace = (java.lang.String)value$.toString(); break; + default: throw new org.apache.avro.AvroRuntimeException("Bad index"); + } + } + + /** + * Gets the value of the 'isrobot' field. + * @return The value of the 'isrobot' field. + */ + public java.lang.Boolean getIsrobot() { + return isrobot; + } + + /** + * Sets the value of the 'isrobot' field. + * @param value the value to set. + */ + public void setIsrobot(java.lang.Boolean value) { + this.isrobot = value; + } + + /** + * Gets the value of the 'channel' field. + * @return The value of the 'channel' field. + */ + public java.lang.String getChannel() { + return channel; + } + + /** + * Sets the value of the 'channel' field. + * @param value the value to set. + */ + public void setChannel(java.lang.String value) { + this.channel = value; + } + + /** + * Gets the value of the 'timestamp' field. + * @return The value of the 'timestamp' field. + */ + public java.lang.String getTimestamp() { + return timestamp; + } + + /** + * Sets the value of the 'timestamp' field. + * @param value the value to set. + */ + public void setTimestamp(java.lang.String value) { + this.timestamp = value; + } + + /** + * Gets the value of the 'flags' field. + * @return The value of the 'flags' field. + */ + public java.lang.String getFlags() { + return flags; + } + + /** + * Sets the value of the 'flags' field. + * @param value the value to set. + */ + public void setFlags(java.lang.String value) { + this.flags = value; + } + + /** + * Gets the value of the 'isunpatrolled' field. + * @return The value of the 'isunpatrolled' field. + */ + public java.lang.Boolean getIsunpatrolled() { + return isunpatrolled; + } + + /** + * Sets the value of the 'isunpatrolled' field. 
+ * @param value the value to set. + */ + public void setIsunpatrolled(java.lang.Boolean value) { + this.isunpatrolled = value; + } + + /** + * Gets the value of the 'page' field. + * @return The value of the 'page' field. + */ + public java.lang.String getPage() { + return page; + } + + /** + * Sets the value of the 'page' field. + * @param value the value to set. + */ + public void setPage(java.lang.String value) { + this.page = value; + } + + /** + * Gets the value of the 'diffurl' field. + * @return The value of the 'diffurl' field. + */ + public java.lang.String getDiffurl() { + return diffurl; + } + + /** + * Sets the value of the 'diffurl' field. + * @param value the value to set. + */ + public void setDiffurl(java.lang.String value) { + this.diffurl = value; + } + + /** + * Gets the value of the 'added' field. + * @return The value of the 'added' field. + */ + public java.lang.Long getAdded() { + return added; + } + + /** + * Sets the value of the 'added' field. + * @param value the value to set. + */ + public void setAdded(java.lang.Long value) { + this.added = value; + } + + /** + * Gets the value of the 'comment' field. + * @return The value of the 'comment' field. + */ + public java.lang.String getComment() { + return comment; + } + + /** + * Sets the value of the 'comment' field. + * @param value the value to set. + */ + public void setComment(java.lang.String value) { + this.comment = value; + } + + /** + * Gets the value of the 'commentlength' field. + * @return The value of the 'commentlength' field. + */ + public java.lang.Long getCommentlength() { + return commentlength; + } + + /** + * Sets the value of the 'commentlength' field. + * @param value the value to set. + */ + public void setCommentlength(java.lang.Long value) { + this.commentlength = value; + } + + /** + * Gets the value of the 'isnew' field. + * @return The value of the 'isnew' field. + */ + public java.lang.Boolean getIsnew() { + return isnew; + } + + /** + * Sets the value of the 'isnew' field. + * @param value the value to set. + */ + public void setIsnew(java.lang.Boolean value) { + this.isnew = value; + } + + /** + * Gets the value of the 'isminor' field. + * @return The value of the 'isminor' field. + */ + public java.lang.Boolean getIsminor() { + return isminor; + } + + /** + * Sets the value of the 'isminor' field. + * @param value the value to set. + */ + public void setIsminor(java.lang.Boolean value) { + this.isminor = value; + } + + /** + * Gets the value of the 'delta' field. + * @return The value of the 'delta' field. + */ + public java.lang.Long getDelta() { + return delta; + } + + /** + * Sets the value of the 'delta' field. + * @param value the value to set. + */ + public void setDelta(java.lang.Long value) { + this.delta = value; + } + + /** + * Gets the value of the 'isanonymous' field. + * @return The value of the 'isanonymous' field. + */ + public java.lang.Boolean getIsanonymous() { + return isanonymous; + } + + /** + * Sets the value of the 'isanonymous' field. + * @param value the value to set. + */ + public void setIsanonymous(java.lang.Boolean value) { + this.isanonymous = value; + } + + /** + * Gets the value of the 'user' field. + * @return The value of the 'user' field. + */ + public java.lang.String getUser() { + return user; + } + + /** + * Sets the value of the 'user' field. + * @param value the value to set. + */ + public void setUser(java.lang.String value) { + this.user = value; + } + + /** + * Gets the value of the 'deltabucket' field. 
+ * @return The value of the 'deltabucket' field. + */ + public java.lang.Double getDeltabucket() { + return deltabucket; + } + + /** + * Sets the value of the 'deltabucket' field. + * @param value the value to set. + */ + public void setDeltabucket(java.lang.Double value) { + this.deltabucket = value; + } + + /** + * Gets the value of the 'deleted' field. + * @return The value of the 'deleted' field. + */ + public java.lang.Long getDeleted() { + return deleted; + } + + /** + * Sets the value of the 'deleted' field. + * @param value the value to set. + */ + public void setDeleted(java.lang.Long value) { + this.deleted = value; + } + + /** + * Gets the value of the 'namespace' field. + * @return The value of the 'namespace' field. + */ + public java.lang.String getNamespace() { + return namespace; + } + + /** + * Sets the value of the 'namespace' field. + * @param value the value to set. + */ + public void setNamespace(java.lang.String value) { + this.namespace = value; + } + + /** + * Creates a new Wikipedia RecordBuilder. + * @return A new Wikipedia RecordBuilder + */ + public static org.apache.hive.kafka.Wikipedia.Builder newBuilder() { + return new org.apache.hive.kafka.Wikipedia.Builder(); + } + + /** + * Creates a new Wikipedia RecordBuilder by copying an existing Builder. + * @param other The existing builder to copy. + * @return A new Wikipedia RecordBuilder + */ + public static org.apache.hive.kafka.Wikipedia.Builder newBuilder(org.apache.hive.kafka.Wikipedia.Builder other) { + return new org.apache.hive.kafka.Wikipedia.Builder(other); + } + + /** + * Creates a new Wikipedia RecordBuilder by copying an existing Wikipedia instance. + * @param other The existing instance to copy. + * @return A new Wikipedia RecordBuilder + */ + public static org.apache.hive.kafka.Wikipedia.Builder newBuilder(org.apache.hive.kafka.Wikipedia other) { + return new org.apache.hive.kafka.Wikipedia.Builder(other); + } + + /** + * RecordBuilder for Wikipedia instances. + */ + public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase + implements org.apache.avro.data.RecordBuilder { + + private boolean isrobot; + private java.lang.String channel; + private java.lang.String timestamp; + private java.lang.String flags; + private boolean isunpatrolled; + private java.lang.String page; + private java.lang.String diffurl; + private long added; + private java.lang.String comment; + private long commentlength; + private boolean isnew; + private boolean isminor; + private long delta; + private boolean isanonymous; + private java.lang.String user; + private double deltabucket; + private long deleted; + private java.lang.String namespace; + + /** Creates a new Builder */ + private Builder() { + super(SCHEMA$); + } + + /** + * Creates a Builder by copying an existing Builder. + * @param other The existing Builder to copy. 
+ */ + private Builder(org.apache.hive.kafka.Wikipedia.Builder other) { + super(other); + if (isValidValue(fields()[0], other.isrobot)) { + this.isrobot = data().deepCopy(fields()[0].schema(), other.isrobot); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.channel)) { + this.channel = data().deepCopy(fields()[1].schema(), other.channel); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.timestamp)) { + this.timestamp = data().deepCopy(fields()[2].schema(), other.timestamp); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.flags)) { + this.flags = data().deepCopy(fields()[3].schema(), other.flags); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.isunpatrolled)) { + this.isunpatrolled = data().deepCopy(fields()[4].schema(), other.isunpatrolled); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.page)) { + this.page = data().deepCopy(fields()[5].schema(), other.page); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.diffurl)) { + this.diffurl = data().deepCopy(fields()[6].schema(), other.diffurl); + fieldSetFlags()[6] = true; + } + if (isValidValue(fields()[7], other.added)) { + this.added = data().deepCopy(fields()[7].schema(), other.added); + fieldSetFlags()[7] = true; + } + if (isValidValue(fields()[8], other.comment)) { + this.comment = data().deepCopy(fields()[8].schema(), other.comment); + fieldSetFlags()[8] = true; + } + if (isValidValue(fields()[9], other.commentlength)) { + this.commentlength = data().deepCopy(fields()[9].schema(), other.commentlength); + fieldSetFlags()[9] = true; + } + if (isValidValue(fields()[10], other.isnew)) { + this.isnew = data().deepCopy(fields()[10].schema(), other.isnew); + fieldSetFlags()[10] = true; + } + if (isValidValue(fields()[11], other.isminor)) { + this.isminor = data().deepCopy(fields()[11].schema(), other.isminor); + fieldSetFlags()[11] = true; + } + if (isValidValue(fields()[12], other.delta)) { + this.delta = data().deepCopy(fields()[12].schema(), other.delta); + fieldSetFlags()[12] = true; + } + if (isValidValue(fields()[13], other.isanonymous)) { + this.isanonymous = data().deepCopy(fields()[13].schema(), other.isanonymous); + fieldSetFlags()[13] = true; + } + if (isValidValue(fields()[14], other.user)) { + this.user = data().deepCopy(fields()[14].schema(), other.user); + fieldSetFlags()[14] = true; + } + if (isValidValue(fields()[15], other.deltabucket)) { + this.deltabucket = data().deepCopy(fields()[15].schema(), other.deltabucket); + fieldSetFlags()[15] = true; + } + if (isValidValue(fields()[16], other.deleted)) { + this.deleted = data().deepCopy(fields()[16].schema(), other.deleted); + fieldSetFlags()[16] = true; + } + if (isValidValue(fields()[17], other.namespace)) { + this.namespace = data().deepCopy(fields()[17].schema(), other.namespace); + fieldSetFlags()[17] = true; + } + } + + /** + * Creates a Builder by copying an existing Wikipedia instance + * @param other The existing instance to copy. 
+ */ + private Builder(org.apache.hive.kafka.Wikipedia other) { + super(SCHEMA$); + if (isValidValue(fields()[0], other.isrobot)) { + this.isrobot = data().deepCopy(fields()[0].schema(), other.isrobot); + fieldSetFlags()[0] = true; + } + if (isValidValue(fields()[1], other.channel)) { + this.channel = data().deepCopy(fields()[1].schema(), other.channel); + fieldSetFlags()[1] = true; + } + if (isValidValue(fields()[2], other.timestamp)) { + this.timestamp = data().deepCopy(fields()[2].schema(), other.timestamp); + fieldSetFlags()[2] = true; + } + if (isValidValue(fields()[3], other.flags)) { + this.flags = data().deepCopy(fields()[3].schema(), other.flags); + fieldSetFlags()[3] = true; + } + if (isValidValue(fields()[4], other.isunpatrolled)) { + this.isunpatrolled = data().deepCopy(fields()[4].schema(), other.isunpatrolled); + fieldSetFlags()[4] = true; + } + if (isValidValue(fields()[5], other.page)) { + this.page = data().deepCopy(fields()[5].schema(), other.page); + fieldSetFlags()[5] = true; + } + if (isValidValue(fields()[6], other.diffurl)) { + this.diffurl = data().deepCopy(fields()[6].schema(), other.diffurl); + fieldSetFlags()[6] = true; + } + if (isValidValue(fields()[7], other.added)) { + this.added = data().deepCopy(fields()[7].schema(), other.added); + fieldSetFlags()[7] = true; + } + if (isValidValue(fields()[8], other.comment)) { + this.comment = data().deepCopy(fields()[8].schema(), other.comment); + fieldSetFlags()[8] = true; + } + if (isValidValue(fields()[9], other.commentlength)) { + this.commentlength = data().deepCopy(fields()[9].schema(), other.commentlength); + fieldSetFlags()[9] = true; + } + if (isValidValue(fields()[10], other.isnew)) { + this.isnew = data().deepCopy(fields()[10].schema(), other.isnew); + fieldSetFlags()[10] = true; + } + if (isValidValue(fields()[11], other.isminor)) { + this.isminor = data().deepCopy(fields()[11].schema(), other.isminor); + fieldSetFlags()[11] = true; + } + if (isValidValue(fields()[12], other.delta)) { + this.delta = data().deepCopy(fields()[12].schema(), other.delta); + fieldSetFlags()[12] = true; + } + if (isValidValue(fields()[13], other.isanonymous)) { + this.isanonymous = data().deepCopy(fields()[13].schema(), other.isanonymous); + fieldSetFlags()[13] = true; + } + if (isValidValue(fields()[14], other.user)) { + this.user = data().deepCopy(fields()[14].schema(), other.user); + fieldSetFlags()[14] = true; + } + if (isValidValue(fields()[15], other.deltabucket)) { + this.deltabucket = data().deepCopy(fields()[15].schema(), other.deltabucket); + fieldSetFlags()[15] = true; + } + if (isValidValue(fields()[16], other.deleted)) { + this.deleted = data().deepCopy(fields()[16].schema(), other.deleted); + fieldSetFlags()[16] = true; + } + if (isValidValue(fields()[17], other.namespace)) { + this.namespace = data().deepCopy(fields()[17].schema(), other.namespace); + fieldSetFlags()[17] = true; + } + } + + /** + * Gets the value of the 'isrobot' field. + * @return The value. + */ + public java.lang.Boolean getIsrobot() { + return isrobot; + } + + /** + * Sets the value of the 'isrobot' field. + * @param value The value of 'isrobot'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setIsrobot(boolean value) { + validate(fields()[0], value); + this.isrobot = value; + fieldSetFlags()[0] = true; + return this; + } + + /** + * Checks whether the 'isrobot' field has been set. + * @return True if the 'isrobot' field has been set, false otherwise. 
+ */ + public boolean hasIsrobot() { + return fieldSetFlags()[0]; + } + + + /** + * Clears the value of the 'isrobot' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearIsrobot() { + fieldSetFlags()[0] = false; + return this; + } + + /** + * Gets the value of the 'channel' field. + * @return The value. + */ + public java.lang.String getChannel() { + return channel; + } + + /** + * Sets the value of the 'channel' field. + * @param value The value of 'channel'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setChannel(java.lang.String value) { + validate(fields()[1], value); + this.channel = value; + fieldSetFlags()[1] = true; + return this; + } + + /** + * Checks whether the 'channel' field has been set. + * @return True if the 'channel' field has been set, false otherwise. + */ + public boolean hasChannel() { + return fieldSetFlags()[1]; + } + + + /** + * Clears the value of the 'channel' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearChannel() { + channel = null; + fieldSetFlags()[1] = false; + return this; + } + + /** + * Gets the value of the 'timestamp' field. + * @return The value. + */ + public java.lang.String getTimestamp() { + return timestamp; + } + + /** + * Sets the value of the 'timestamp' field. + * @param value The value of 'timestamp'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setTimestamp(java.lang.String value) { + validate(fields()[2], value); + this.timestamp = value; + fieldSetFlags()[2] = true; + return this; + } + + /** + * Checks whether the 'timestamp' field has been set. + * @return True if the 'timestamp' field has been set, false otherwise. + */ + public boolean hasTimestamp() { + return fieldSetFlags()[2]; + } + + + /** + * Clears the value of the 'timestamp' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearTimestamp() { + timestamp = null; + fieldSetFlags()[2] = false; + return this; + } + + /** + * Gets the value of the 'flags' field. + * @return The value. + */ + public java.lang.String getFlags() { + return flags; + } + + /** + * Sets the value of the 'flags' field. + * @param value The value of 'flags'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setFlags(java.lang.String value) { + validate(fields()[3], value); + this.flags = value; + fieldSetFlags()[3] = true; + return this; + } + + /** + * Checks whether the 'flags' field has been set. + * @return True if the 'flags' field has been set, false otherwise. + */ + public boolean hasFlags() { + return fieldSetFlags()[3]; + } + + + /** + * Clears the value of the 'flags' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearFlags() { + flags = null; + fieldSetFlags()[3] = false; + return this; + } + + /** + * Gets the value of the 'isunpatrolled' field. + * @return The value. + */ + public java.lang.Boolean getIsunpatrolled() { + return isunpatrolled; + } + + /** + * Sets the value of the 'isunpatrolled' field. + * @param value The value of 'isunpatrolled'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setIsunpatrolled(boolean value) { + validate(fields()[4], value); + this.isunpatrolled = value; + fieldSetFlags()[4] = true; + return this; + } + + /** + * Checks whether the 'isunpatrolled' field has been set. + * @return True if the 'isunpatrolled' field has been set, false otherwise. 
+ */ + public boolean hasIsunpatrolled() { + return fieldSetFlags()[4]; + } + + + /** + * Clears the value of the 'isunpatrolled' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearIsunpatrolled() { + fieldSetFlags()[4] = false; + return this; + } + + /** + * Gets the value of the 'page' field. + * @return The value. + */ + public java.lang.String getPage() { + return page; + } + + /** + * Sets the value of the 'page' field. + * @param value The value of 'page'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setPage(java.lang.String value) { + validate(fields()[5], value); + this.page = value; + fieldSetFlags()[5] = true; + return this; + } + + /** + * Checks whether the 'page' field has been set. + * @return True if the 'page' field has been set, false otherwise. + */ + public boolean hasPage() { + return fieldSetFlags()[5]; + } + + + /** + * Clears the value of the 'page' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearPage() { + page = null; + fieldSetFlags()[5] = false; + return this; + } + + /** + * Gets the value of the 'diffurl' field. + * @return The value. + */ + public java.lang.String getDiffurl() { + return diffurl; + } + + /** + * Sets the value of the 'diffurl' field. + * @param value The value of 'diffurl'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setDiffurl(java.lang.String value) { + validate(fields()[6], value); + this.diffurl = value; + fieldSetFlags()[6] = true; + return this; + } + + /** + * Checks whether the 'diffurl' field has been set. + * @return True if the 'diffurl' field has been set, false otherwise. + */ + public boolean hasDiffurl() { + return fieldSetFlags()[6]; + } + + + /** + * Clears the value of the 'diffurl' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearDiffurl() { + diffurl = null; + fieldSetFlags()[6] = false; + return this; + } + + /** + * Gets the value of the 'added' field. + * @return The value. + */ + public java.lang.Long getAdded() { + return added; + } + + /** + * Sets the value of the 'added' field. + * @param value The value of 'added'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setAdded(long value) { + validate(fields()[7], value); + this.added = value; + fieldSetFlags()[7] = true; + return this; + } + + /** + * Checks whether the 'added' field has been set. + * @return True if the 'added' field has been set, false otherwise. + */ + public boolean hasAdded() { + return fieldSetFlags()[7]; + } + + + /** + * Clears the value of the 'added' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearAdded() { + fieldSetFlags()[7] = false; + return this; + } + + /** + * Gets the value of the 'comment' field. + * @return The value. + */ + public java.lang.String getComment() { + return comment; + } + + /** + * Sets the value of the 'comment' field. + * @param value The value of 'comment'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setComment(java.lang.String value) { + validate(fields()[8], value); + this.comment = value; + fieldSetFlags()[8] = true; + return this; + } + + /** + * Checks whether the 'comment' field has been set. + * @return True if the 'comment' field has been set, false otherwise. + */ + public boolean hasComment() { + return fieldSetFlags()[8]; + } + + + /** + * Clears the value of the 'comment' field. 
+ * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearComment() { + comment = null; + fieldSetFlags()[8] = false; + return this; + } + + /** + * Gets the value of the 'commentlength' field. + * @return The value. + */ + public java.lang.Long getCommentlength() { + return commentlength; + } + + /** + * Sets the value of the 'commentlength' field. + * @param value The value of 'commentlength'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setCommentlength(long value) { + validate(fields()[9], value); + this.commentlength = value; + fieldSetFlags()[9] = true; + return this; + } + + /** + * Checks whether the 'commentlength' field has been set. + * @return True if the 'commentlength' field has been set, false otherwise. + */ + public boolean hasCommentlength() { + return fieldSetFlags()[9]; + } + + + /** + * Clears the value of the 'commentlength' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearCommentlength() { + fieldSetFlags()[9] = false; + return this; + } + + /** + * Gets the value of the 'isnew' field. + * @return The value. + */ + public java.lang.Boolean getIsnew() { + return isnew; + } + + /** + * Sets the value of the 'isnew' field. + * @param value The value of 'isnew'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setIsnew(boolean value) { + validate(fields()[10], value); + this.isnew = value; + fieldSetFlags()[10] = true; + return this; + } + + /** + * Checks whether the 'isnew' field has been set. + * @return True if the 'isnew' field has been set, false otherwise. + */ + public boolean hasIsnew() { + return fieldSetFlags()[10]; + } + + + /** + * Clears the value of the 'isnew' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearIsnew() { + fieldSetFlags()[10] = false; + return this; + } + + /** + * Gets the value of the 'isminor' field. + * @return The value. + */ + public java.lang.Boolean getIsminor() { + return isminor; + } + + /** + * Sets the value of the 'isminor' field. + * @param value The value of 'isminor'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setIsminor(boolean value) { + validate(fields()[11], value); + this.isminor = value; + fieldSetFlags()[11] = true; + return this; + } + + /** + * Checks whether the 'isminor' field has been set. + * @return True if the 'isminor' field has been set, false otherwise. + */ + public boolean hasIsminor() { + return fieldSetFlags()[11]; + } + + + /** + * Clears the value of the 'isminor' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearIsminor() { + fieldSetFlags()[11] = false; + return this; + } + + /** + * Gets the value of the 'delta' field. + * @return The value. + */ + public java.lang.Long getDelta() { + return delta; + } + + /** + * Sets the value of the 'delta' field. + * @param value The value of 'delta'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setDelta(long value) { + validate(fields()[12], value); + this.delta = value; + fieldSetFlags()[12] = true; + return this; + } + + /** + * Checks whether the 'delta' field has been set. + * @return True if the 'delta' field has been set, false otherwise. + */ + public boolean hasDelta() { + return fieldSetFlags()[12]; + } + + + /** + * Clears the value of the 'delta' field. + * @return This builder. 
+ */ + public org.apache.hive.kafka.Wikipedia.Builder clearDelta() { + fieldSetFlags()[12] = false; + return this; + } + + /** + * Gets the value of the 'isanonymous' field. + * @return The value. + */ + public java.lang.Boolean getIsanonymous() { + return isanonymous; + } + + /** + * Sets the value of the 'isanonymous' field. + * @param value The value of 'isanonymous'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setIsanonymous(boolean value) { + validate(fields()[13], value); + this.isanonymous = value; + fieldSetFlags()[13] = true; + return this; + } + + /** + * Checks whether the 'isanonymous' field has been set. + * @return True if the 'isanonymous' field has been set, false otherwise. + */ + public boolean hasIsanonymous() { + return fieldSetFlags()[13]; + } + + + /** + * Clears the value of the 'isanonymous' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearIsanonymous() { + fieldSetFlags()[13] = false; + return this; + } + + /** + * Gets the value of the 'user' field. + * @return The value. + */ + public java.lang.String getUser() { + return user; + } + + /** + * Sets the value of the 'user' field. + * @param value The value of 'user'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setUser(java.lang.String value) { + validate(fields()[14], value); + this.user = value; + fieldSetFlags()[14] = true; + return this; + } + + /** + * Checks whether the 'user' field has been set. + * @return True if the 'user' field has been set, false otherwise. + */ + public boolean hasUser() { + return fieldSetFlags()[14]; + } + + + /** + * Clears the value of the 'user' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearUser() { + user = null; + fieldSetFlags()[14] = false; + return this; + } + + /** + * Gets the value of the 'deltabucket' field. + * @return The value. + */ + public java.lang.Double getDeltabucket() { + return deltabucket; + } + + /** + * Sets the value of the 'deltabucket' field. + * @param value The value of 'deltabucket'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setDeltabucket(double value) { + validate(fields()[15], value); + this.deltabucket = value; + fieldSetFlags()[15] = true; + return this; + } + + /** + * Checks whether the 'deltabucket' field has been set. + * @return True if the 'deltabucket' field has been set, false otherwise. + */ + public boolean hasDeltabucket() { + return fieldSetFlags()[15]; + } + + + /** + * Clears the value of the 'deltabucket' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearDeltabucket() { + fieldSetFlags()[15] = false; + return this; + } + + /** + * Gets the value of the 'deleted' field. + * @return The value. + */ + public java.lang.Long getDeleted() { + return deleted; + } + + /** + * Sets the value of the 'deleted' field. + * @param value The value of 'deleted'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setDeleted(long value) { + validate(fields()[16], value); + this.deleted = value; + fieldSetFlags()[16] = true; + return this; + } + + /** + * Checks whether the 'deleted' field has been set. + * @return True if the 'deleted' field has been set, false otherwise. + */ + public boolean hasDeleted() { + return fieldSetFlags()[16]; + } + + + /** + * Clears the value of the 'deleted' field. + * @return This builder. 
+ */ + public org.apache.hive.kafka.Wikipedia.Builder clearDeleted() { + fieldSetFlags()[16] = false; + return this; + } + + /** + * Gets the value of the 'namespace' field. + * @return The value. + */ + public java.lang.String getNamespace() { + return namespace; + } + + /** + * Sets the value of the 'namespace' field. + * @param value The value of 'namespace'. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder setNamespace(java.lang.String value) { + validate(fields()[17], value); + this.namespace = value; + fieldSetFlags()[17] = true; + return this; + } + + /** + * Checks whether the 'namespace' field has been set. + * @return True if the 'namespace' field has been set, false otherwise. + */ + public boolean hasNamespace() { + return fieldSetFlags()[17]; + } + + + /** + * Clears the value of the 'namespace' field. + * @return This builder. + */ + public org.apache.hive.kafka.Wikipedia.Builder clearNamespace() { + namespace = null; + fieldSetFlags()[17] = false; + return this; + } + + @Override + @SuppressWarnings("unchecked") + public Wikipedia build() { + try { + Wikipedia record = new Wikipedia(); + record.isrobot = fieldSetFlags()[0] ? this.isrobot : (java.lang.Boolean) defaultValue(fields()[0]); + record.channel = fieldSetFlags()[1] ? this.channel : (java.lang.String) defaultValue(fields()[1]); + record.timestamp = fieldSetFlags()[2] ? this.timestamp : (java.lang.String) defaultValue(fields()[2]); + record.flags = fieldSetFlags()[3] ? this.flags : (java.lang.String) defaultValue(fields()[3]); + record.isunpatrolled = fieldSetFlags()[4] ? this.isunpatrolled : (java.lang.Boolean) defaultValue(fields()[4]); + record.page = fieldSetFlags()[5] ? this.page : (java.lang.String) defaultValue(fields()[5]); + record.diffurl = fieldSetFlags()[6] ? this.diffurl : (java.lang.String) defaultValue(fields()[6]); + record.added = fieldSetFlags()[7] ? this.added : (java.lang.Long) defaultValue(fields()[7]); + record.comment = fieldSetFlags()[8] ? this.comment : (java.lang.String) defaultValue(fields()[8]); + record.commentlength = fieldSetFlags()[9] ? this.commentlength : (java.lang.Long) defaultValue(fields()[9]); + record.isnew = fieldSetFlags()[10] ? this.isnew : (java.lang.Boolean) defaultValue(fields()[10]); + record.isminor = fieldSetFlags()[11] ? this.isminor : (java.lang.Boolean) defaultValue(fields()[11]); + record.delta = fieldSetFlags()[12] ? this.delta : (java.lang.Long) defaultValue(fields()[12]); + record.isanonymous = fieldSetFlags()[13] ? this.isanonymous : (java.lang.Boolean) defaultValue(fields()[13]); + record.user = fieldSetFlags()[14] ? this.user : (java.lang.String) defaultValue(fields()[14]); + record.deltabucket = fieldSetFlags()[15] ? this.deltabucket : (java.lang.Double) defaultValue(fields()[15]); + record.deleted = fieldSetFlags()[16] ? this.deleted : (java.lang.Long) defaultValue(fields()[16]); + record.namespace = fieldSetFlags()[17] ? 
this.namespace : (java.lang.String) defaultValue(fields()[17]); + return record; + } catch (java.lang.Exception e) { + throw new org.apache.avro.AvroRuntimeException(e); + } + } + } + + @SuppressWarnings("unchecked") + private static final org.apache.avro.io.DatumWriter + WRITER$ = (org.apache.avro.io.DatumWriter)MODEL$.createDatumWriter(SCHEMA$); + + @Override public void writeExternal(java.io.ObjectOutput out) + throws java.io.IOException { + WRITER$.write(this, SpecificData.getEncoder(out)); + } + + @SuppressWarnings("unchecked") + private static final org.apache.avro.io.DatumReader + READER$ = (org.apache.avro.io.DatumReader)MODEL$.createDatumReader(SCHEMA$); + + @Override public void readExternal(java.io.ObjectInput in) + throws java.io.IOException { + READER$.read(this, SpecificData.getDecoder(in)); + } + +} diff --git itests/qtest/pom.xml itests/qtest/pom.xml index 5767806017..801a43d02f 100644 --- itests/qtest/pom.xml +++ itests/qtest/pom.xml @@ -138,7 +138,12 @@ ${project.version} test - + + org.apache.hive + kafka-handler + ${project.version} + test + diff --git itests/qtest/src/test/java/org/apache/hadoop/hive/cli/TestMiniDruidKafkaCliDriver.java itests/qtest/src/test/java/org/apache/hadoop/hive/cli/TestMiniDruidKafkaCliDriver.java index c54b2bf63a..4768975225 100644 --- itests/qtest/src/test/java/org/apache/hadoop/hive/cli/TestMiniDruidKafkaCliDriver.java +++ itests/qtest/src/test/java/org/apache/hadoop/hive/cli/TestMiniDruidKafkaCliDriver.java @@ -21,7 +21,6 @@ import org.apache.hadoop.hive.cli.control.CliConfigs; import org.junit.ClassRule; -import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TestRule; @@ -56,7 +55,6 @@ public TestMiniDruidKafkaCliDriver(String name, File qfile) { this.qfile = qfile; } - @Ignore("HIVE-19509: Disable tests that are failing continuously") @Test public void testCliDriver() throws Exception { adapter.runTest(name, qfile); diff --git itests/src/test/resources/testconfiguration.properties itests/src/test/resources/testconfiguration.properties index af7f089975..9ec28f50e2 100644 --- itests/src/test/resources/testconfiguration.properties +++ itests/src/test/resources/testconfiguration.properties @@ -1725,7 +1725,8 @@ druid.query.files=druidmini_test1.q,\ druid.llap.local.query.files=druidmini_noop.q -druid.kafka.query.files=druidkafkamini_basic.q +druid.kafka.query.files=druidkafkamini_basic.q \ + kafka_storage_handler.q # tests to be run by TestErasureCodingHDFSCliDriver and TestCliDriver erasurecoding.shared.query.files=erasure_commands.q diff --git itests/util/src/main/java/org/apache/hadoop/hive/cli/control/CliConfigs.java itests/util/src/main/java/org/apache/hadoop/hive/cli/control/CliConfigs.java index 92919e9daf..491b6db581 100644 --- itests/util/src/main/java/org/apache/hadoop/hive/cli/control/CliConfigs.java +++ itests/util/src/main/java/org/apache/hadoop/hive/cli/control/CliConfigs.java @@ -202,6 +202,7 @@ public MiniDruidKafkaCliConfig() { setQueryDir("ql/src/test/queries/clientpositive"); includesFrom(testConfigProps, "druid.kafka.query.files"); + excludeQuery("druidkafkamini_basic.q"); // HIVE-19509 setResultsDir("ql/src/test/results/clientpositive/druid"); setLogDir("itests/qtest/target/tmp/log"); diff --git itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java index 5adbb63693..8f9e4eccb8 100644 --- itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java +++ 
itests/util/src/main/java/org/apache/hadoop/hive/ql/QTestUtil.java @@ -39,6 +39,8 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.sql.SQLException; +import java.sql.Timestamp; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -53,7 +55,14 @@ import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; - +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.BinaryEncoder; +import org.apache.avro.io.DatumWriter; +import org.apache.avro.io.EncoderFactory; +import org.apache.avro.specific.SpecificDatumWriter; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.commons.lang.StringUtils; @@ -112,6 +121,7 @@ import org.apache.hive.common.util.StreamPrinter; import org.apache.hive.druid.MiniDruidCluster; import org.apache.hive.kafka.SingleNodeKafkaCluster; +import org.apache.hive.kafka.Wikipedia; import org.apache.logging.log4j.util.Strings; import org.apache.tools.ant.BuildException; import org.apache.zookeeper.WatchedEvent; @@ -143,6 +153,7 @@ static final Logger LOG = LoggerFactory.getLogger("QTestUtil"); private final static String defaultInitScript = "q_test_init.sql"; private final static String defaultCleanupScript = "q_test_cleanup.sql"; + private static SimpleDateFormat formatter = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss"); private final String[] testOnlyCommands = new String[]{"crypto", "erasure"}; public static final String TEST_TMP_DIR_PROPERTY = "test.tmp.dir"; // typically target/tmp @@ -671,6 +682,7 @@ private void setupMiniCluster(HadoopShims shims, String confDir) throws "test-topic", new File(getScriptsDir(), "kafka_init_data.json") ); + kafkaCluster.createTopicWithData("wiki_kafka_avro_table", getAvroRows()); } if (clusterType.getCoreClusterType() == CoreClusterType.TEZ) { @@ -709,6 +721,48 @@ private void setupMiniCluster(HadoopShims shims, String confDir) throws } } + private static List getAvroRows() { + int numRows = 10; + List events; + final DatumWriter writer = new SpecificDatumWriter<>(Wikipedia.getClassSchema()); + events = + IntStream.rangeClosed(0, numRows) + .mapToObj(i -> Wikipedia.newBuilder() + // 1534736225090 -> 08/19/2018 20:37:05 + .setTimestamp(formatter.format(new Timestamp(1534736225090L + 1000 * 3600 * i))) + .setAdded(i * 300) + .setDeleted(-i) + .setIsrobot(i % 2 == 0) + .setChannel("chanel number " + i) + .setComment("comment number " + i) + .setCommentlength(i) + .setDiffurl(String.format("url %s", i)) + .setFlags("flag") + .setIsminor(i % 2 > 0) + .setIsanonymous(i % 3 != 0) + .setNamespace("namespace") + .setIsunpatrolled(new Boolean(i % 3 == 0)) + .setIsnew(new Boolean(i % 2 > 0)) + .setPage(String.format("page is %s", i * 100)) + .setDelta(i) + .setDeltabucket(i * 100.4) + .setUser("test-user-" + i) + .build()) + .map(genericRecord -> { + java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream(); + BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null); + try { + writer.write(genericRecord, encoder); + encoder.flush(); + out.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + return out.toByteArray(); + }) + .collect(Collectors.toList()); + return events; + } public void shutdown() throws Exception { if (System.getenv(QTEST_LEAVE_FILES) == null) { diff --git kafka-handler/pom.xml kafka-handler/pom.xml new 
file mode 100644 index 0000000000..6c58bf1df1 --- /dev/null +++ kafka-handler/pom.xml @@ -0,0 +1,160 @@ + + + + + + org.apache.hive + hive + 4.0.0-SNAPSHOT + ../pom.xml + + 4.0.0 + + + .. + 1.0.1 + + + kafka-handler + jar + Hive Kafka Storage Handler + + + + + org.apache.hive + hive-exec + provided + ${project.version} + + + com.google.guava + guava + + + org.apache.hadoop + hadoop-common + + + org.apache.hadoop + hadoop-client + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + + junit + junit + ${junit.version} + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + test + test + + + + org.apache.kafka + kafka_2.11 + ${kafka.version} + test + test + + + org.apache.kafka + kafka_2.11 + ${kafka.version} + test + + + + + + dev-fast-build + + + skipShade + !true + + + + + + org.apache.maven.plugins + maven-shade-plugin + ${maven.shade.plugin.version} + + + package + + shade + + + true + false + + + org.apache.kafka:* + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + static/ + + + + + + + + + + + + + ${basedir}/src/java + ${basedir}/src/test + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + + + + + + \ No newline at end of file diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/GenericKafkaSerDe.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/GenericKafkaSerDe.java new file mode 100644 index 0000000000..e7ea53f4bc --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/GenericKafkaSerDe.java @@ -0,0 +1,192 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.base.Preconditions; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DecoderFactory; +import org.apache.avro.specific.SpecificDatumReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.JsonSerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.avro.AvroGenericRecordWritable; +import org.apache.hadoop.hive.serde2.avro.AvroSerDe; +import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.rmi.server.UID; +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; + +/** + * Generic Kafka SerDe that allows the user to delegate the actual serialization/deserialization to another SerDe, + * such as Avro, JSON, or any SerDe that supports {@link BytesWritable}.
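+ *
+ * <p>A minimal usage sketch (assuming the table points this SerDe at Hive's Avro SerDe through the property
+ * named by {@code KafkaStreamingUtils.SERDE_CLASS_NAME} and also carries the Avro schema, e.g. via
+ * {@code avro.schema.literal}; {@code conf} and {@code schemaJson} are placeholders):
+ * <pre>{@code
+ *   Properties tbl = new Properties();
+ *   tbl.setProperty(KafkaStreamingUtils.SERDE_CLASS_NAME, AvroSerDe.class.getName());
+ *   tbl.setProperty("avro.schema.literal", schemaJson);
+ *   GenericKafkaSerDe serDe = new GenericKafkaSerDe();
+ *   serDe.initialize(conf, tbl); // exposes the delegate's columns first, then the Kafka metadata columns
+ * }</pre>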
+ */ +public class GenericKafkaSerDe extends AbstractSerDe { + private static final Logger LOG = LoggerFactory.getLogger(GenericKafkaSerDe.class); + // ORDER of fields and types matters here + private static final ImmutableList + METADATA_COLUMN_NAMES = + ImmutableList.of(KafkaStreamingUtils.PARTITION_COLUMN, + KafkaStreamingUtils.OFFSET_COLUMN, + KafkaStreamingUtils.TIMESTAMP_COLUMN, + KafkaStreamingUtils.START_OFFSET_COLUMN, + KafkaStreamingUtils.END_OFFSET_COLUMN); + private static final ImmutableList + METADATA_PRIMITIVE_TYPE_INFO = + ImmutableList.of(TypeInfoFactory.intTypeInfo, + TypeInfoFactory.longTypeInfo, + TypeInfoFactory.longTypeInfo, + TypeInfoFactory.longTypeInfo, + TypeInfoFactory.longTypeInfo); + + private AbstractSerDe delegateSerDe; + private ObjectInspector objectInspector; + private final List columnNames = Lists.newArrayList(); + private StructObjectInspector delegateObjectInspector; + private final UID uid = new UID(); + @SuppressWarnings("Guava") private Supplier> gdrSupplier; + + @Override public void initialize(@Nullable Configuration conf, Properties tbl) throws SerDeException { + final String className = tbl.getProperty(KafkaStreamingUtils.SERDE_CLASS_NAME, KafkaJsonSerDe.class.getName()); + delegateSerDe = KafkaStreamingUtils.createDelegate(className); + //noinspection deprecation + delegateSerDe.initialize(conf, tbl); + LOG.debug("Using SerDe instance {}", delegateSerDe.getClass().getCanonicalName()); + + if (!(delegateSerDe.getObjectInspector() instanceof StructObjectInspector)) { + throw new SerDeException("Was expecting StructObject Inspector but have " + delegateSerDe.getObjectInspector() + .getClass() + .getName()); + } + delegateObjectInspector = (StructObjectInspector) delegateSerDe.getObjectInspector(); + + // Build column names Order matters here + columnNames.addAll(delegateObjectInspector.getAllStructFieldRefs() + .stream() + .map(StructField::getFieldName) + .collect(Collectors.toList())); + columnNames.addAll(METADATA_COLUMN_NAMES); + + final List inspectors = new ArrayList<>(columnNames.size()); + inspectors.addAll(delegateObjectInspector.getAllStructFieldRefs() + .stream() + .map(StructField::getFieldObjectInspector) + .collect(Collectors.toList())); + inspectors.addAll(METADATA_PRIMITIVE_TYPE_INFO.stream() + .map(KafkaJsonSerDe.typeInfoToObjectInspector) + .collect(Collectors.toList())); + objectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors); + + // lazy supplier to read Avro Records if needed + gdrSupplier = getReaderSupplier(tbl); + } + + @Override public Class getSerializedClass() { + return delegateSerDe.getSerializedClass(); + } + + @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + return delegateSerDe.serialize(obj, objInspector); + } + + @Override public SerDeStats getSerDeStats() { + return delegateSerDe.getSerDeStats(); + } + + @Override public Object deserialize(Writable blob) throws SerDeException { + KafkaRecordWritable record = (KafkaRecordWritable) blob; + // switch case the serde nature + final Object row; + if (delegateSerDe instanceof JsonSerDe) { + //@TODO Text constructor copies the data, this op is not needed + row = delegateSerDe.deserialize(new Text(record.getValue())); + } else if (delegateSerDe instanceof AvroSerDe) { + AvroGenericRecordWritable avroGenericRecordWritable = new AvroGenericRecordWritable(); + GenericRecord avroRecord; + try { + avroRecord = gdrSupplier.get().read(null, 
DecoderFactory.get().binaryDecoder(record.getValue(), null)); + avroGenericRecordWritable.setRecord(avroRecord); + avroGenericRecordWritable.setRecordReaderID(uid); + avroGenericRecordWritable.setFileSchema(avroRecord.getSchema()); + } catch (IOException e) { + throw new SerDeException(e); + } + row = delegateSerDe.deserialize(avroGenericRecordWritable); + } else { + // default assuming delegate Serde know how to deal with + row = delegateSerDe.deserialize(new BytesWritable(record.getValue())); + } + + return columnNames.stream().map(name -> { + switch (name) { + case KafkaStreamingUtils.PARTITION_COLUMN: + return new IntWritable(record.getPartition()); + case KafkaStreamingUtils.OFFSET_COLUMN: + return new LongWritable(record.getOffset()); + case KafkaStreamingUtils.TIMESTAMP_COLUMN: + return new LongWritable(record.getTimestamp()); + case KafkaStreamingUtils.START_OFFSET_COLUMN: + return new LongWritable(record.getStartOffset()); + case KafkaStreamingUtils.END_OFFSET_COLUMN: + return new LongWritable(record.getEndOffset()); + default: + return delegateObjectInspector.getStructFieldData(row, delegateObjectInspector.getStructFieldRef(name)); + } + }).collect(Collectors.toList()); + } + + @Override public ObjectInspector getObjectInspector() { + return objectInspector; + } + + @SuppressWarnings("Guava") private Supplier> getReaderSupplier(Properties tbl) { + return Suppliers.memoize(() -> { + String schemaFromProperty = tbl.getProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName(), ""); + Preconditions.checkArgument(!schemaFromProperty.isEmpty(), "Avro Schema is empty Can not go further"); + Schema schema = AvroSerdeUtils.getSchemaFor(schemaFromProperty); + LOG.debug("Building Avro Reader with schema {}", schemaFromProperty); + return new SpecificDatumReader<>(schema); + }); + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaJsonSerDe.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaJsonSerDe.java new file mode 100644 index 0000000000..f383190083 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaJsonSerDe.java @@ -0,0 +1,248 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.kafka; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.common.type.HiveChar; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.common.type.HiveVarchar; +import org.apache.hadoop.hive.common.type.TimestampTZ; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.SerDeUtils; +import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable; +import org.apache.hadoop.hive.serde2.io.ByteWritable; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.io.HiveCharWritable; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable; +import org.apache.hadoop.hive.serde2.io.ShortWritable; +import org.apache.hadoop.hive.serde2.io.TimestampLocalTZWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TimestampLocalTZTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.BooleanWritable; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.DateTimeFormatterBuilder; +import org.joda.time.format.DateTimeParser; +import org.joda.time.format.ISODateTimeFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.time.Instant; +import java.time.ZonedDateTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * Basic JsonSerDe to make use of such storage handler smooth and easy and testing basic primitive Json. + * For production please use Hive native JsonSerde. 
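+ * <p>
+ * For illustration, records are expected to be flat JSON objects whose keys match the Hive column names
+ * case-insensitively; assuming columns {@code user string, delta bigint}, the payload
+ * <pre>{@code
+ * {"USER": "alice", "delta": 7}
+ * }</pre>
+ * deserializes into a {@code Text("alice")} and a {@code LongWritable(7)}. Nested categories
+ * (map, list, struct, union) are rejected with a SerDeException.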
+ */ +public class KafkaJsonSerDe extends AbstractSerDe { + private static final Logger LOG = LoggerFactory.getLogger(KafkaJsonSerDe.class); + private static final DateTimeFormatter TS_PARSER = createAutoParser(); + static Function + typeInfoToObjectInspector = typeInfo -> + PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + TypeInfoFactory.getPrimitiveTypeInfo(typeInfo.getTypeName())); + private List columnNames; + private List columnTypes; + private ObjectInspector inspector; + private final ObjectMapper mapper = new ObjectMapper(); + private long rowCount = 0L; + private long rawDataSize = 0L; + + @Override public void initialize(@Nullable Configuration conf, Properties tbl) { + final List inspectors; + // Get column names and types + String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS); + String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES); + final String + columnNameDelimiter = + tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? + tbl.getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : + String.valueOf(SerDeUtils.COMMA); + // all table column names + if (!columnNameProperty.isEmpty()) { + columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter)); + } + // all column types + if (!columnTypeProperty.isEmpty()) { + columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + } + + if (LOG.isDebugEnabled()) { + LOG.debug("columns: {}, {}", columnNameProperty, columnNames); + LOG.debug("types: {}, {} ", columnTypeProperty, columnTypes); + } + + inspectors = columnTypes.stream().map(typeInfoToObjectInspector).collect(Collectors.toList()); + inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors); + } + + @Override public Class getSerializedClass() { + return BytesRefWritable.class; + } + + @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { + throw new SerDeException("unimplemented"); + } + + @Override public SerDeStats getSerDeStats() { + SerDeStats serDeStats = new SerDeStats(); + serDeStats.setRawDataSize(rawDataSize); + serDeStats.setRowCount(rowCount); + return serDeStats; + } + + @Override public Object deserialize(Writable blob) throws SerDeException { + BytesWritable record = (BytesWritable) blob; + Map payload; + try { + payload = parseAsJson(record.getBytes()); + rowCount += 1; + rawDataSize += record.getLength(); + } catch (IOException e) { + throw new SerDeException(e); + } + + final List output = new ArrayList<>(columnNames.size()); + + for (int i = 0; i < columnNames.size(); i++) { + final String name = columnNames.get(i); + final TypeInfo typeInfo = columnTypes.get(i); + final JsonNode value = payload.get(name); + if (value == null) { + output.add(null); + } else { + switch (columnTypes.get(i).getCategory()) { + case PRIMITIVE: + output.add(parseAsPrimitive(value, typeInfo)); + break; + case MAP: + case LIST: + case UNION: + case STRUCT: + default: + throw new SerDeException("not supported yet"); + } + } + + } + return output; + } + + private Object parseAsPrimitive(JsonNode value, TypeInfo typeInfo) throws SerDeException { + switch (TypeInfoFactory.getPrimitiveTypeInfo(typeInfo.getTypeName()).getPrimitiveCategory()) { + case TIMESTAMP: + TimestampWritable timestampWritable = new TimestampWritable(); + timestampWritable.setTime(TS_PARSER.parseMillis(value.textValue())); + return timestampWritable; + + case TIMESTAMPLOCALTZ: + final long numberOfMillis = 
TS_PARSER.parseMillis(value.textValue()); + return new TimestampLocalTZWritable(new TimestampTZ(ZonedDateTime.ofInstant(Instant.ofEpochMilli(numberOfMillis), + ((TimestampLocalTZTypeInfo) typeInfo).timeZone()))); + + case BYTE: + return new ByteWritable((byte) value.intValue()); + case SHORT: + return (new ShortWritable(value.shortValue())); + case INT: + return new IntWritable(value.intValue()); + case LONG: + return (new LongWritable((value.longValue()))); + case FLOAT: + return (new FloatWritable(value.floatValue())); + case DOUBLE: + return (new DoubleWritable(value.doubleValue())); + case DECIMAL: + return (new HiveDecimalWritable(HiveDecimal.create(value.decimalValue()))); + case CHAR: + return (new HiveCharWritable(new HiveChar(value.textValue(), ((CharTypeInfo) typeInfo).getLength()))); + case VARCHAR: + return (new HiveVarcharWritable(new HiveVarchar(value.textValue(), ((CharTypeInfo) typeInfo).getLength()))); + case STRING: + return (new Text(value.textValue())); + case BOOLEAN: + return (new BooleanWritable(value.isBoolean() ? value.booleanValue() : Boolean.valueOf(value.textValue()))); + default: + throw new SerDeException("Unknown type: " + typeInfo.getTypeName()); + } + } + + private Map parseAsJson(byte[] value) throws IOException { + JsonNode document = mapper.readValue(value, JsonNode.class); + //Hive Column names are case insensitive. + Map documentMap = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); + document.fields().forEachRemaining(field -> documentMap.put(field.getKey().toLowerCase(), field.getValue())); + return documentMap; + } + + @Override public ObjectInspector getObjectInspector() throws SerDeException { + if (inspector == null) { + throw new SerDeException("null inspector ??"); + } + return inspector; + } + + private static DateTimeFormatter createAutoParser() { + final DateTimeFormatter + offsetElement = + new DateTimeFormatterBuilder().appendTimeZoneOffset("Z", true, 2, 4).toFormatter(); + + DateTimeParser + timeOrOffset = + new DateTimeFormatterBuilder().append(null, + new DateTimeParser[] { + new DateTimeFormatterBuilder().appendLiteral('T').toParser(), + new DateTimeFormatterBuilder().appendLiteral(' ').toParser() }) + .appendOptional(ISODateTimeFormat.timeElementParser().getParser()) + .appendOptional(offsetElement.getParser()) + .toParser(); + + return new DateTimeFormatterBuilder().append(ISODateTimeFormat.dateElementParser()) + .appendOptional(timeOrOffset) + .toFormatter(); + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerInputFormat.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerInputFormat.java new file mode 100644 index 0000000000..2d5637d430 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerInputFormat.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.collect.ImmutableMap; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.SerializationUtilities; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.plan.TableScanDesc; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapreduce.InputFormat; +import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.PartitionInfo; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.stream.Collectors; + +/** + * Kafka puller input format to read records from a Kafka Queue. + * The input split will contain the set of topic partition and start/end offsets. + * Records will be returned as bytes array. + */ +public class KafkaPullerInputFormat extends InputFormat + implements org.apache.hadoop.mapred.InputFormat { + + private static final Logger LOG = LoggerFactory.getLogger(KafkaPullerInputFormat.class); + + @Override public InputSplit[] getSplits(JobConf jobConf, int i) throws IOException { + List inputSplits; + try { + inputSplits = computeSplits(jobConf); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException(e); + } + InputSplit[] inputSplitsArray = new InputSplit[inputSplits.size()]; + return inputSplits.toArray(inputSplitsArray); + } + + /** + * Build a full scan using Kafka list partition then beginning/end offsets. 
+ * This function might block duo to calls like: + * org.apache.kafka.clients.consumer.KafkaConsumer#beginningOffsets(java.util.Collection) + * + * @param topic kafka topic + * @param consumer initialized kafka consumer + * @param tablePaths hive table path + * + * @return full scan input split collection based on Kafka metadata APIs + */ + private static List buildFullScanFromKafka(String topic, + KafkaConsumer consumer, + Path[] tablePaths) { + final Map starOffsetsMap; + final Map endOffsetsMap; + + final List topicPartitions; + topicPartitions = fetchTopicPartitions(topic, consumer); + starOffsetsMap = consumer.beginningOffsets(topicPartitions); + endOffsetsMap = consumer.endOffsets(topicPartitions); + + if (LOG.isDebugEnabled()) { + LOG.info("Found the following partitions [{}]", + topicPartitions.stream().map(TopicPartition::toString).collect(Collectors.joining(","))); + starOffsetsMap.forEach((tp, start) -> LOG.info("TPartition [{}],Start offsets [{}]", tp, start)); + endOffsetsMap.forEach((tp, end) -> LOG.info("TPartition [{}],End offsets [{}]", tp, end)); + } + return topicPartitions.stream() + .map(topicPartition -> new KafkaPullerInputSplit(topicPartition.topic(), + topicPartition.partition(), + starOffsetsMap.get(topicPartition), + endOffsetsMap.get(topicPartition), + tablePaths[0])) + .collect(Collectors.toList()); + } + + private List computeSplits(Configuration configuration) + throws IOException, InterruptedException { + // this will be used to harness some KAFKA blocking calls + final ExecutorService execService = Executors.newSingleThreadExecutor(); + try (KafkaConsumer consumer = new KafkaConsumer(KafkaStreamingUtils.consumerProperties(configuration))) { + final String topic = configuration.get(KafkaStreamingUtils.HIVE_KAFKA_TOPIC); + final long + timeoutMs = + configuration.getLong(KafkaStreamingUtils.HIVE_KAFKA_POLL_TIMEOUT, + KafkaStreamingUtils.DEFAULT_CONSUMER_POLL_TIMEOUT_MS); + // hive depends on FileSplits + JobConf jobConf = new JobConf(configuration); + Path[] tablePaths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(jobConf); + + //noinspection unchecked + Future> + futureFullHouse = + execService.submit(() -> buildFullScanFromKafka(topic, consumer, tablePaths)); + List fullHouse; + try { + fullHouse = futureFullHouse.get(timeoutMs, TimeUnit.MILLISECONDS); + } catch (TimeoutException | ExecutionException e) { + futureFullHouse.cancel(true); + LOG.error("can not generate full scan split", e); + // at this point we can not go further fail split generation + throw new IOException(e); + } + + @SuppressWarnings("unchecked") final ImmutableMap.Builder + fullHouseMapBuilder = + new ImmutableMap.Builder(); + fullHouse.forEach(input -> fullHouseMapBuilder.put(new TopicPartition(input.getTopic(), input.getPartition()), + input)); + + final KafkaScanTrimmer kafkaScanTrimmer = new KafkaScanTrimmer(fullHouseMapBuilder.build(), consumer); + final String filterExprSerialized = configuration.get(TableScanDesc.FILTER_EXPR_CONF_STR); + + if (filterExprSerialized != null && !filterExprSerialized.isEmpty()) { + ExprNodeGenericFuncDesc filterExpr = SerializationUtilities.deserializeExpression(filterExprSerialized); + LOG.info("Kafka trimmer working on Filter tree {}", filterExpr.getExprString()); + Callable> + trimmerWorker = () -> kafkaScanTrimmer.computeOptimizedScan(filterExpr) + .entrySet() + .stream() + .map(Map.Entry::getValue) + .collect(Collectors.toList()); + + Future> futureTinyHouse = execService.submit(trimmerWorker); + try { + return 
futureTinyHouse.get(timeoutMs, TimeUnit.MILLISECONDS) + .stream() + // filter out empty splits + .filter(split -> split.getStartOffset() < split.getEndOffset()) + .collect(Collectors.toList()); + } catch (ExecutionException | TimeoutException e) { + futureTinyHouse.cancel(true); + LOG.error("Had issue with trimmer will return full scan ", e); + return fullHouse; + } + } + //Case null: it can be filter evaluated to false or no filter at all thus return full scan + return fullHouse; + } finally { + execService.shutdown(); + } + } + + private static List fetchTopicPartitions(String topic, KafkaConsumer consumer) { + // this will block till REQUEST_TIMEOUT_MS_CONFIG = "request.timeout.ms" + // then throws org.apache.kafka.common.errors.TimeoutException if can not fetch metadata + // @TODO add retry logic maybe + List partitions = consumer.partitionsFor(topic); + return partitions.stream().map(p -> new TopicPartition(topic, p.partition())).collect(Collectors.toList()); + } + + @Override public RecordReader getRecordReader(InputSplit inputSplit, + JobConf jobConf, + Reporter reporter) { + return new KafkaPullerRecordReader((KafkaPullerInputSplit) inputSplit, jobConf); + } + + @Override public List getSplits(JobContext jobContext) + throws IOException, InterruptedException { + return computeSplits(jobContext.getConfiguration()).stream() + .map(kafkaPullerInputSplit -> (org.apache.hadoop.mapreduce.InputSplit) kafkaPullerInputSplit) + .collect(Collectors.toList()); + } + + @Override public org.apache.hadoop.mapreduce.RecordReader createRecordReader( + org.apache.hadoop.mapreduce.InputSplit inputSplit, + TaskAttemptContext taskAttemptContext) { + return new KafkaPullerRecordReader(); + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerInputSplit.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerInputSplit.java new file mode 100644 index 0000000000..697469c9e0 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerInputSplit.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.base.Objects; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.FileSplit; + +import javax.annotation.Nullable; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +/** + * Kafka Hadoop Input Split Class. 
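+ * A split covers a single topic partition and the half-open offset range [startOffset, endOffset).
+ * <p>
+ * For illustration, a full-scan split for one partition can be derived from the consumer metadata the same
+ * way KafkaPullerInputFormat does it (topic name and table path below are placeholders):
+ * <pre>{@code
+ * TopicPartition tp = new TopicPartition("my_topic", 0);
+ * long start = consumer.beginningOffsets(Collections.singletonList(tp)).get(tp);
+ * long end = consumer.endOffsets(Collections.singletonList(tp)).get(tp);
+ * KafkaPullerInputSplit split = new KafkaPullerInputSplit("my_topic", 0, start, end, tablePath);
+ * }</pre>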
+ */ +@SuppressWarnings("WeakerAccess") public class KafkaPullerInputSplit extends FileSplit + implements org.apache.hadoop.mapred.InputSplit { + private String topic; + private long startOffset; + private int partition; + private long endOffset; + + public KafkaPullerInputSplit() { + super(null, 0, 0, (String[]) null); + } + + public KafkaPullerInputSplit(String topic, int partition, long startOffset, long endOffset, Path dummyPath) { + super(dummyPath, 0, 0, (String[]) null); + this.topic = topic; + this.startOffset = startOffset; + this.partition = partition; + this.endOffset = endOffset; + Preconditions.checkArgument(startOffset >= 0 && startOffset <= endOffset, + "start [%s] has to be positive and >= end [%]", + startOffset, + endOffset); + } + + @Override public long getLength() { + return 0; + } + + @Override public String[] getLocations() { + return new String[0]; + } + + @Override public void write(DataOutput dataOutput) throws IOException { + super.write(dataOutput); + dataOutput.writeUTF(topic); + dataOutput.writeInt(partition); + dataOutput.writeLong(startOffset); + dataOutput.writeLong(endOffset); + } + + @Override public void readFields(DataInput dataInput) throws IOException { + super.readFields(dataInput); + topic = dataInput.readUTF(); + partition = dataInput.readInt(); + startOffset = dataInput.readLong(); + endOffset = dataInput.readLong(); + Preconditions.checkArgument(startOffset >= 0 && startOffset <= endOffset, + "start [%s] has to be positive and >= end [%]", + startOffset, + endOffset); + } + + public String getTopic() { + return topic; + } + + public int getPartition() { + return partition; + } + + public long getStartOffset() { + return startOffset; + } + + public long getEndOffset() { + return endOffset; + } + + /** + * Compute the intersection of 2 splits. Splits must share the same topic and partition number. + * + * @param split1 left split + * @param split2 right split + * + * @return new split that represents range intersection or null if it is not overlapping + */ + @Nullable public static KafkaPullerInputSplit intersectRange(KafkaPullerInputSplit split1, + KafkaPullerInputSplit split2) { + assert (split1.topic.equals(split2.topic)); + assert (split1.partition == split2.partition); + final long startOffset = Math.max(split1.getStartOffset(), split2.getStartOffset()); + final long endOffset = Math.min(split1.getEndOffset(), split2.getEndOffset()); + if (startOffset > endOffset) { + // there is no overlapping + return null; + } + return new KafkaPullerInputSplit(split1.topic, split1.partition, startOffset, endOffset, split1.getPath()); + } + + /** + * Compute union of ranges between splits. Splits must share the same topic and partition + * + * @param split1 left split + * @param split2 right split + * + * @return new split with a range including both splits. 
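+ *
+ * <p>For illustration, given two splits over the same topic partition (the {@code path} argument is a placeholder):
+ * <pre>{@code
+ * KafkaPullerInputSplit a = new KafkaPullerInputSplit("t", 0, 5L, 20L, path);
+ * KafkaPullerInputSplit b = new KafkaPullerInputSplit("t", 0, 10L, 30L, path);
+ * KafkaPullerInputSplit.unionRange(a, b);     // covers offsets [5, 30)
+ * KafkaPullerInputSplit.intersectRange(a, b); // covers offsets [10, 20)
+ * KafkaPullerInputSplit.intersectRange(a, new KafkaPullerInputSplit("t", 0, 25L, 30L, path)); // null, no overlap
+ * }</pre>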
+ */ + public static KafkaPullerInputSplit unionRange(KafkaPullerInputSplit split1, KafkaPullerInputSplit split2) { + assert (split1.topic.equals(split2.topic)); + assert (split1.partition == split2.partition); + final long startOffset = Math.min(split1.getStartOffset(), split2.getStartOffset()); + final long endOffset = Math.max(split1.getEndOffset(), split2.getEndOffset()); + return new KafkaPullerInputSplit(split1.topic, split1.partition, startOffset, endOffset, split1.getPath()); + } + + @Override public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof KafkaPullerInputSplit)) { + return false; + } + KafkaPullerInputSplit that = (KafkaPullerInputSplit) o; + return Objects.equal(getTopic(), that.getTopic()) + && Objects.equal(getStartOffset(), that.getStartOffset()) + && Objects.equal(getPartition(), that.getPartition()) + && Objects.equal(getEndOffset(), that.getEndOffset()); + } + + @Override public int hashCode() { + return Objects.hashCode(getTopic(), getStartOffset(), getPartition(), getEndOffset()); + } + + @Override public String toString() { + return "KafkaPullerInputSplit{" + + "topic='" + + topic + + '\'' + + ", startOffset=" + + startOffset + + ", partition=" + + partition + + ", endOffset=" + + endOffset + + ", path=" + + super.getPath().toString() + + '}'; + } + + public static KafkaPullerInputSplit copyOf(KafkaPullerInputSplit other) { + return new KafkaPullerInputSplit(other.getTopic(), + other.getPartition(), + other.getStartOffset(), + other.getEndOffset(), + other.getPath()); + } + + @SuppressWarnings("MethodDoesntCallSuperMethod") public KafkaPullerInputSplit clone() { + return copyOf(this); + } + + public static List slice(long sliceSize, final KafkaPullerInputSplit split) { + if (split.getEndOffset() - split.getStartOffset() > sliceSize) { + ImmutableList.Builder builder = ImmutableList.builder(); + long start = split.getStartOffset(); + while (start < split.getEndOffset() - sliceSize) { + builder.add(new KafkaPullerInputSplit(split.topic, + split.partition, + start, + start + sliceSize + 1, + split.getPath())); + start += sliceSize + 1; + } + // last split + if (start < split.getEndOffset()) { + builder.add(new KafkaPullerInputSplit(split.topic, + split.partition, + start, + split.getEndOffset(), + split.getPath())); + } + return builder.build(); + } + + return Collections.singletonList(copyOf(split)); + } + +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerRecordReader.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerRecordReader.java new file mode 100644 index 0000000000..908ee5e29f --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaPullerRecordReader.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.base.Preconditions; +import com.google.common.io.Closer; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.kafka.clients.CommonClientConfigs; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Properties; + +/** + * Kafka Records Reader implementation. + */ +@SuppressWarnings("UnstableApiUsage") public class KafkaPullerRecordReader extends RecordReader + implements org.apache.hadoop.mapred.RecordReader { + + private static final Logger LOG = LoggerFactory.getLogger(KafkaPullerRecordReader.class); + + private final Closer closer = Closer.create(); + private KafkaConsumer consumer = null; + private Configuration config = null; + private KafkaRecordWritable currentWritableValue; + private Iterator> recordsCursor = null; + + private long totalNumberRecords = 0L; + private long consumedRecords = 0L; + private long readBytes = 0L; + private volatile boolean started = false; + private long startOffset = -1L; + private long endOffset = Long.MAX_VALUE; + + @SuppressWarnings("WeakerAccess") public KafkaPullerRecordReader() { + } + + private void initConsumer() { + if (consumer == null) { + LOG.info("Initializing Kafka Consumer"); + final Properties properties = KafkaStreamingUtils.consumerProperties(config); + String brokerString = properties.getProperty(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG); + Preconditions.checkNotNull(brokerString, "broker end point can not be null"); + LOG.info("Starting Consumer with Kafka broker string [{}]", brokerString); + consumer = new KafkaConsumer<>(properties); + closer.register(consumer); + } + } + + @SuppressWarnings("WeakerAccess") public KafkaPullerRecordReader(KafkaPullerInputSplit inputSplit, + Configuration jobConf) { + initialize(inputSplit, jobConf); + } + + private synchronized void initialize(KafkaPullerInputSplit inputSplit, Configuration jobConf) { + if (!started) { + this.config = jobConf; + startOffset = inputSplit.getStartOffset(); + endOffset = inputSplit.getEndOffset(); + TopicPartition topicPartition = new TopicPartition(inputSplit.getTopic(), inputSplit.getPartition()); + Preconditions.checkState(startOffset >= 0 && startOffset <= endOffset, + "Start [%s] has to be positive and less or equal than End [%s]", startOffset, endOffset); + totalNumberRecords += endOffset - startOffset; + initConsumer(); + long + pollTimeout = + config.getLong(KafkaStreamingUtils.HIVE_KAFKA_POLL_TIMEOUT, + KafkaStreamingUtils.DEFAULT_CONSUMER_POLL_TIMEOUT_MS); + LOG.debug("Consumer poll timeout [{}] ms", pollTimeout); + this.recordsCursor = + startOffset == endOffset ? 
+ new KafkaRecordIterator.EmptyIterator() : + new KafkaRecordIterator(consumer, topicPartition, startOffset, endOffset, pollTimeout); + started = true; + } + } + + @Override public void initialize(org.apache.hadoop.mapreduce.InputSplit inputSplit, + TaskAttemptContext context) { + initialize((KafkaPullerInputSplit) inputSplit, context.getConfiguration()); + } + + @Override public boolean next(NullWritable nullWritable, KafkaRecordWritable bytesWritable) { + if (started && recordsCursor.hasNext()) { + ConsumerRecord record = recordsCursor.next(); + bytesWritable.set(record, startOffset, endOffset); + consumedRecords += 1; + readBytes += record.serializedValueSize(); + return true; + } + return false; + } + + @Override public NullWritable createKey() { + return NullWritable.get(); + } + + @Override public KafkaRecordWritable createValue() { + return new KafkaRecordWritable(); + } + + @Override public long getPos() { + return -1; + } + + @Override public boolean nextKeyValue() { + currentWritableValue = new KafkaRecordWritable(); + if (next(NullWritable.get(), currentWritableValue)) { + return true; + } + currentWritableValue = null; + return false; + } + + @Override public NullWritable getCurrentKey() { + return NullWritable.get(); + } + + @Override public KafkaRecordWritable getCurrentValue() { + return Preconditions.checkNotNull(currentWritableValue); + } + + @Override public float getProgress() { + if (consumedRecords == 0) { + return 0f; + } + if (consumedRecords >= totalNumberRecords) { + return 1f; + } + return consumedRecords * 1.0f / totalNumberRecords; + } + + @Override public void close() throws IOException { + LOG.trace("total read bytes [{}]", readBytes); + if (consumer != null) { + consumer.wakeup(); + } + closer.close(); + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaRecordIterator.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaRecordIterator.java new file mode 100644 index 0000000000..7daa3e2544 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaRecordIterator.java @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.base.Preconditions; +import com.google.common.base.Stopwatch; +import com.google.common.collect.ImmutableList; +import org.apache.kafka.clients.consumer.Consumer; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.errors.TimeoutException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * Iterator over Kafka Records to read records from a single topic partition inclusive start exclusive end. + *

+ * If {@code startOffset} is not null the iterator seeks to that offset, otherwise it seeks to the beginning
+ * of the partition, see {@link org.apache.kafka.clients.consumer.Consumer#seekToBeginning(java.util.Collection)}.
+ *
+ * When provided with an end offset, the iterator returns records up to the record with offset == endOffset - 1;
+ * if the end offset is null it reads up to the current end, see
+ * {@link org.apache.kafka.clients.consumer.Consumer#endOffsets(java.util.Collection)}.
+ *
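+ * For illustration, a typical read of one split's range (the consumer and poll timeout are assumed to be
+ * configured by the caller, as done in KafkaPullerRecordReader):
+ * <pre>{@code
+ * KafkaRecordIterator it = new KafkaRecordIterator(consumer, new TopicPartition("t", 0), 0L, 100L, 5000L);
+ * while (it.hasNext()) {
+ *   ConsumerRecord<byte[], byte[]> record = it.next(); // offsets 0..99, returned in order
+ * }
+ * }</pre>
+ *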
+ * Current implementation of this Iterator will throw and exception if can not poll up to the endOffset - 1 + */ +public class KafkaRecordIterator implements Iterator> { + private static final Logger LOG = LoggerFactory.getLogger(KafkaRecordIterator.class); + + private final Consumer consumer; + private final TopicPartition topicPartition; + private long endOffset; + private long startOffset; + private final long pollTimeoutMs; + private final Stopwatch stopwatch = Stopwatch.createUnstarted(); + private ConsumerRecords records; + private long currentOffset; + private ConsumerRecord nextRecord; + private boolean hasMore = true; + private final boolean started; + + //Kafka consumer poll method return an iterator of records. + private Iterator> consumerRecordIterator = null; + + /** + * @param consumer functional kafka consumer + * @param topicPartition kafka topic partition + * @param startOffset start position of stream. + * @param endOffset requested end position. If null will read up to current last + * @param pollTimeoutMs poll time out in ms + */ + KafkaRecordIterator(Consumer consumer, + TopicPartition topicPartition, + @Nullable Long startOffset, + @Nullable Long endOffset, + long pollTimeoutMs) { + this.consumer = Preconditions.checkNotNull(consumer, "Consumer can not be null"); + this.topicPartition = Preconditions.checkNotNull(topicPartition, "Topic partition can not be null"); + this.pollTimeoutMs = pollTimeoutMs; + Preconditions.checkState(this.pollTimeoutMs > 0, "poll timeout has to be positive number"); + this.startOffset = startOffset == null ? -1L : startOffset; + this.endOffset = endOffset == null ? -1L : endOffset; + assignAndSeek(); + this.started = true; + } + + KafkaRecordIterator(Consumer consumer, TopicPartition tp, long pollTimeoutMs) { + this(consumer, tp, null, null, pollTimeoutMs); + } + + private void assignAndSeek() { + // assign topic partition to consumer + final List topicPartitionList = ImmutableList.of(topicPartition); + if (LOG.isTraceEnabled()) { + stopwatch.reset().start(); + } + + consumer.assign(topicPartitionList); + // compute offsets and seek to start + if (startOffset > -1) { + LOG.info("Seeking to offset [{}] of topic partition [{}]", startOffset, topicPartition); + consumer.seek(topicPartition, startOffset); + } else { + LOG.info("Seeking to beginning of topic partition [{}]", topicPartition); + // seekToBeginning is lazy thus need to call position() or poll(0) + this.consumer.seekToBeginning(Collections.singleton(topicPartition)); + startOffset = consumer.position(topicPartition); + } + if (endOffset == -1) { + this.endOffset = consumer.endOffsets(topicPartitionList).get(topicPartition); + LOG.info("EndOffset set to {}", endOffset); + } + currentOffset = consumer.position(topicPartition); + Preconditions.checkState(this.endOffset >= currentOffset, + "End offset [%s] need to be greater than start offset [%s]", + this.endOffset, + currentOffset); + LOG.info("Kafka Iterator ready, assigned TopicPartition [{}]; startOffset [{}]; endOffset [{}]", + topicPartition, + currentOffset, + this.endOffset); + if (LOG.isTraceEnabled()) { + stopwatch.stop(); + LOG.trace("Time to assign and seek [{}] ms", stopwatch.elapsed(TimeUnit.MILLISECONDS)); + } + } + + @Override + public boolean hasNext() { + /* + Poll more records from Kafka queue IF: + Initial poll case -> (records == null) + OR + Need to poll at least one more record (currentOffset + 1 < endOffset) AND consumerRecordIterator is empty (!hasMore) + */ + if (!hasMore && currentOffset + 1 < endOffset || 
records == null) { + pollRecords(); + findNext(); + } + return hasMore; + } + + /** + * Poll more records or Fail with {@link TimeoutException} if no records returned before reaching target end offset. + */ + private void pollRecords() { + if (LOG.isTraceEnabled()) { + stopwatch.reset().start(); + } + Preconditions.checkArgument(started); + records = consumer.poll(pollTimeoutMs); + if (LOG.isTraceEnabled()) { + stopwatch.stop(); + LOG.trace("Pulled [{}] records in [{}] ms", records.count(), stopwatch.elapsed(TimeUnit.MILLISECONDS)); + } + // Fail if we can not poll within one lap of pollTimeoutMs. + if (records.isEmpty() && currentOffset < endOffset) { + throw new TimeoutException(String.format("Current offset: [%s]-TopicPartition:[%s], target End offset:[%s]." + + "Consumer returned 0 record due to exhausted poll timeout [%s]ms, try increasing[%s]", + currentOffset, + topicPartition.toString(), + endOffset, + pollTimeoutMs, + KafkaStreamingUtils.HIVE_KAFKA_POLL_TIMEOUT)); + } + consumerRecordIterator = records.iterator(); + } + + @Override public ConsumerRecord next() { + ConsumerRecord value = nextRecord; + Preconditions.checkState(value.offset() < endOffset); + findNext(); + return Preconditions.checkNotNull(value); + } + + /** + * Find the next element in the batch of returned records by previous poll or set hasMore to false tp poll more next + * call to {@link KafkaRecordIterator#hasNext()}. + */ + private void findNext() { + if (consumerRecordIterator.hasNext()) { + nextRecord = consumerRecordIterator.next(); + hasMore = true; + if (nextRecord.offset() < endOffset) { + currentOffset = nextRecord.offset(); + return; + } + } + hasMore = false; + nextRecord = null; + } + + /** + * Empty iterator for empty splits when startOffset == endOffset, this is added to avoid clumsy if condition. + */ + protected static final class EmptyIterator implements Iterator> { + @Override public boolean hasNext() { + return false; + } + + @Override public ConsumerRecord next() { + throw new IllegalStateException("this is an empty iterator"); + } + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaRecordWritable.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaRecordWritable.java new file mode 100644 index 0000000000..c6924ea480 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaRecordWritable.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import org.apache.hadoop.io.Writable; +import org.apache.kafka.clients.consumer.ConsumerRecord; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; +import java.util.Objects; + +/** + * Writable implementation of Kafka ConsumerRecord. 
+ * Serialized in the form + * {@code timestamp} long| {@code partition} (int) | {@code offset} (long) | + * {@code startOffset} (long) | {@code endOffset} (long) | {@code value.size()} (int) | {@code value} (byte []) + */ +public class KafkaRecordWritable implements Writable { + + /** + * Kafka partition id + */ + private int partition; + /** + * Record Offset + */ + private long offset; + /** + * Fist offset given by the input split used to pull the event {@link KafkaPullerInputSplit#getStartOffset()} + */ + private long startOffset; + /** + * Last Offset given by the input split used to pull the event {@link KafkaPullerInputSplit#getEndOffset()} + */ + private long endOffset; + /** + * Event timestamp provided by Kafka Record {@link ConsumerRecord#timestamp()} + */ + private long timestamp; + /** + * Record value + */ + private byte[] value; + + void set(ConsumerRecord consumerRecord, long startOffset, long endOffset) { + this.partition = consumerRecord.partition(); + this.timestamp = consumerRecord.timestamp(); + this.offset = consumerRecord.offset(); + this.value = consumerRecord.value(); + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + KafkaRecordWritable(int partition, + long offset, + long timestamp, + byte[] value, + long startOffset, + long endOffset) { + this.partition = partition; + this.offset = offset; + this.timestamp = timestamp; + this.value = value; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @SuppressWarnings("WeakerAccess") public KafkaRecordWritable() { + } + + @Override public void write(DataOutput dataOutput) throws IOException { + dataOutput.writeLong(timestamp); + dataOutput.writeInt(partition); + dataOutput.writeLong(offset); + dataOutput.writeLong(startOffset); + dataOutput.writeLong(endOffset); + dataOutput.writeInt(value.length); + dataOutput.write(value); + } + + @Override public void readFields(DataInput dataInput) throws IOException { + timestamp = dataInput.readLong(); + partition = dataInput.readInt(); + offset = dataInput.readLong(); + startOffset = dataInput.readLong(); + endOffset = dataInput.readLong(); + int size = dataInput.readInt(); + if (size > 0) { + value = new byte[size]; + dataInput.readFully(value); + } else { + value = new byte[0]; + } + } + + int getPartition() { + return partition; + } + + long getOffset() { + return offset; + } + + long getTimestamp() { + return timestamp; + } + + byte[] getValue() { + return value; + } + + long getStartOffset() { + return startOffset; + } + + long getEndOffset() { + return endOffset; + } + + @Override public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof KafkaRecordWritable)) { + return false; + } + KafkaRecordWritable writable = (KafkaRecordWritable) o; + return partition == writable.partition + && offset == writable.offset + && startOffset == writable.startOffset + && endOffset == writable.endOffset + && timestamp == writable.timestamp + && Arrays.equals(value, writable.value); + } + + @Override public int hashCode() { + int result = Objects.hash(partition, offset, startOffset, endOffset, timestamp); + result = 31 * result + Arrays.hashCode(value); + return result; + } + + @Override public String toString() { + return "KafkaRecordWritable{" + + "partition=" + + partition + + ", offset=" + + offset + + ", startOffset=" + + startOffset + + ", endOffset=" + + endOffset + + ", timestamp=" + + timestamp + + ", value=" + + Arrays.toString(value) + + '}'; + } +} diff --git 
kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaScanTrimmer.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaScanTrimmer.java new file mode 100644 index 0000000000..76415151ec --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaScanTrimmer.java @@ -0,0 +1,482 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import org.apache.hadoop.hive.ql.exec.FunctionRegistry; +import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToChar; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDate; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToVarchar; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.consumer.OffsetAndTimestamp; +import org.apache.kafka.common.TopicPartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.Predicate; + +/** + * Kafka Range trimmer, takes a full kafka scan and prune the scan based on a filter expression + * it is a Best effort trimmer and it can not replace the filter it self, filtration still takes place in Hive executor. 
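+ * <p>
+ * For illustration, the intended call pattern (the full scan, consumer and pushed-down filter are assumed to be
+ * built as in KafkaPullerInputFormat#computeSplits):
+ * <pre>{@code
+ * KafkaScanTrimmer trimmer = new KafkaScanTrimmer(fullScan, consumer);
+ * Map<TopicPartition, KafkaPullerInputSplit> pruned = trimmer.computeOptimizedScan(filterExpr);
+ * // an equality predicate on the partition column keeps only that partition's split,
+ * // a range predicate on the offset column narrows each split's [start, end) range
+ * }</pre>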
+ */ +class KafkaScanTrimmer { + private static final Logger LOG = LoggerFactory.getLogger(KafkaScanTrimmer.class); + private final Map fullHouse; + private final KafkaConsumer kafkaConsumer; + + /** + * @param fullHouse initial full scan to be pruned, this is a map of Topic partition to input split. + * @param kafkaConsumer kafka consumer used to pull offsets for time filter if needed + */ + KafkaScanTrimmer(Map fullHouse, KafkaConsumer kafkaConsumer) { + this.fullHouse = fullHouse; + this.kafkaConsumer = kafkaConsumer; + } + + /** + * This might block due to calls like. + * org.apache.kafka.clients.consumer.KafkaConsumer#offsetsForTimes(java.util.Map) + * + * @param filterExpression filter expression to be used for pruning scan + * + * @return tiny house of of the full house based on filter expression + */ + Map computeOptimizedScan(ExprNodeGenericFuncDesc filterExpression) { + Map optimizedScan = parseAndOptimize(filterExpression); + + if (LOG.isDebugEnabled()) { + if (optimizedScan != null) { + LOG.debug("Optimized scan:"); + optimizedScan.forEach((tp, input) -> LOG.info( + "Topic-[{}] Partition-[{}] - Split startOffset [{}] :-> endOffset [{}]", + tp.topic(), + tp.partition(), + input.getStartOffset(), + input.getEndOffset())); + } else { + LOG.debug("No optimization thus using full scan "); + fullHouse.forEach((tp, input) -> LOG.info( + "Topic-[{}] Partition-[{}] - Split startOffset [{}] :-> endOffset [{}]", + tp.topic(), + tp.partition(), + input.getStartOffset(), + input.getEndOffset())); + } + } + return optimizedScan == null ? fullHouse : optimizedScan; + } + + /** + * @param expression filter to parseAndOptimize and trim the full scan + * + * @return Map of optimized kafka range scans or null if it is impossible to optimize. + */ + @Nullable private Map parseAndOptimize(ExprNodeDesc expression) { + if (expression.getClass() != ExprNodeGenericFuncDesc.class) { + return null; + } + // get the kind of expression + ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) expression; + Class op = expr.getGenericUDF().getClass(); + + // handle the logical operators + if (FunctionRegistry.isOpOr(expr)) { + return pushOrOp(expr); + } + if (FunctionRegistry.isOpAnd(expr)) { + return pushAndOp(expr); + } + + if (op == GenericUDFOPGreaterThan.class) { + return pushLeaf(expr, PredicateLeaf.Operator.LESS_THAN_EQUALS, true); + } else if (op == GenericUDFOPEqualOrGreaterThan.class) { + return pushLeaf(expr, PredicateLeaf.Operator.LESS_THAN, true); + } else if (op == GenericUDFOPLessThan.class) { + return pushLeaf(expr, PredicateLeaf.Operator.LESS_THAN, false); + } else if (op == GenericUDFOPEqualOrLessThan.class) { + return pushLeaf(expr, PredicateLeaf.Operator.LESS_THAN_EQUALS, false); + } else if (op == GenericUDFOPEqual.class) { + return pushLeaf(expr, PredicateLeaf.Operator.EQUALS, false); + // otherwise, we didn't understand it, so bailout + } else { + return null; + } + } + + /** + * @param expr leaf node to push + * @param operator operator + * @param negation true if it is a negation, this is used to represent: + * GenericUDFOPGreaterThan and GenericUDFOPEqualOrGreaterThan + * using PredicateLeaf.Operator.LESS_THAN and PredicateLeaf.Operator.LESS_THAN_EQUALS + * + * @return leaf scan or null if can not figure out push down + */ + @Nullable private Map pushLeaf(ExprNodeGenericFuncDesc expr, + PredicateLeaf.Operator operator, + boolean negation) { + if (expr.getChildren().size() != 2) { + return null; + } + GenericUDF genericUDF = expr.getGenericUDF(); + if (!(genericUDF instanceof 
GenericUDFBaseCompare)) { + return null; + } + ExprNodeDesc expr1 = expr.getChildren().get(0); + ExprNodeDesc expr2 = expr.getChildren().get(1); + // We may need to peel off the GenericUDFBridge that is added by CBO or user + if (expr1.getTypeInfo().equals(expr2.getTypeInfo())) { + expr1 = getColumnExpr(expr1); + expr2 = getColumnExpr(expr2); + } + + ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(expr1, expr2); + if (extracted == null || (extracted.length > 2)) { + return null; + } + + ExprNodeColumnDesc columnDesc; + ExprNodeConstantDesc constantDesc; + final boolean flip; + + if (extracted[0] instanceof ExprNodeColumnDesc) { + columnDesc = (ExprNodeColumnDesc) extracted[0]; + constantDesc = (ExprNodeConstantDesc) extracted[1]; + flip = false; + + } else { + flip = true; + columnDesc = (ExprNodeColumnDesc) extracted[1]; + constantDesc = (ExprNodeConstantDesc) extracted[0]; + } + + + if (columnDesc.getColumn().equals(KafkaStreamingUtils.PARTITION_COLUMN)) { + return buildScanFromPartitionPredicate(fullHouse, + operator, + ((Number) constantDesc.getValue()).intValue(), + flip, + negation); + + } + if (columnDesc.getColumn().equals(KafkaStreamingUtils.OFFSET_COLUMN)) { + return buildScanFromOffsetPredicate(fullHouse, + operator, + ((Number) constantDesc.getValue()).longValue(), + flip, + negation); + } + + if (columnDesc.getColumn().equals(KafkaStreamingUtils.TIMESTAMP_COLUMN)) { + long timestamp = ((Number) constantDesc.getValue()).longValue(); + //noinspection unchecked + return buildScanForTimesPredicate(fullHouse, operator, timestamp, flip, negation, kafkaConsumer); + } + return null; + } + + /** + * Trim kafka scan using a leaf binary predicate on partition column. + * + * @param fullScan kafka full scan to be optimized + * @param operator predicate operator, equal, lessThan or lessThanEqual + * @param partitionConst partition constant value + * @param flip true if the position of column and constant is flipped by default assuming column OP constant + * @param negation true if the expression is a negation of the original expression + * + * @return filtered kafka scan + */ + + @VisibleForTesting static Map buildScanFromPartitionPredicate( + Map fullScan, + PredicateLeaf.Operator operator, + int partitionConst, + boolean flip, + boolean negation) { + final Predicate predicate; + final Predicate intermediatePredicate; + switch (operator) { + case EQUALS: + predicate = topicPartition -> topicPartition != null && topicPartition.partition() == partitionConst; + break; + case LESS_THAN: + intermediatePredicate = + flip ? + topicPartition -> topicPartition != null && partitionConst < topicPartition.partition() : + topicPartition -> topicPartition != null && topicPartition.partition() < partitionConst; + + predicate = negation ? intermediatePredicate.negate() : intermediatePredicate; + break; + case LESS_THAN_EQUALS: + intermediatePredicate = + flip ? + topicPartition -> topicPartition != null && partitionConst <= topicPartition.partition() : + topicPartition -> topicPartition != null && topicPartition.partition() <= partitionConst; + + predicate = negation ? 
intermediatePredicate.negate() : intermediatePredicate; + break; + default: + //Default to select * for unknown cases + predicate = topicPartition -> true; + } + + ImmutableMap.Builder builder = ImmutableMap.builder(); + // Filter full scan based on predicate + fullScan.entrySet() + .stream() + .filter(entry -> predicate.test(entry.getKey())) + .forEach(entry -> builder.put(entry.getKey(), entry.getValue().clone())); + return builder.build(); + } + + /** + * @param fullScan full kafka scan to be pruned + * @param operator operator kind + * @param offsetConst offset constant value + * @param flip true if position of constant and column were flipped by default assuming COLUMN OP CONSTANT + * @param negation true if the expression is a negation of the original expression + * + * @return optimized kafka scan + */ + @VisibleForTesting static Map buildScanFromOffsetPredicate(Map fullScan, + PredicateLeaf.Operator operator, + long offsetConst, + boolean flip, + boolean negation) { + final boolean isEndBound; + final long startOffset; + final long endOffset; + + isEndBound = flip == negation; + switch (operator) { + case LESS_THAN_EQUALS: + if (isEndBound) { + startOffset = -1; + endOffset = negation ? offsetConst : offsetConst + 1; + } else { + endOffset = -1; + startOffset = negation ? offsetConst + 1 : offsetConst; + } + break; + case EQUALS: + startOffset = offsetConst; + endOffset = offsetConst + 1; + break; + case LESS_THAN: + if (isEndBound) { + endOffset = negation ? offsetConst + 1 : offsetConst; + startOffset = -1; + } else { + endOffset = -1; + startOffset = negation ? offsetConst : offsetConst + 1; + } + break; + default: + // default to select * + startOffset = -1; + endOffset = -1; + } + + final Map newScan = new HashMap<>(); + + fullScan.forEach((tp, existingInputSplit) -> { + final KafkaPullerInputSplit newInputSplit; + if (startOffset != -1 && endOffset == -1) { + newInputSplit = new KafkaPullerInputSplit(tp.topic(), + tp.partition(), + //if the user ask for start offset > max offset will replace with last offset + Math.min(startOffset, existingInputSplit.getEndOffset()), + existingInputSplit.getEndOffset(), + existingInputSplit.getPath()); + } else if (endOffset != -1 && startOffset == -1) { + newInputSplit = new KafkaPullerInputSplit(tp.topic(), tp.partition(), existingInputSplit.getStartOffset(), + //@TODO check this, if user ask for non existing end offset ignore it and position head on start + // This can be an issue when doing ingestion from kafka into Hive, what happen if there is some gaps + // Shall we fail the ingest or carry-on and ignore non existing offsets + Math.max(endOffset, existingInputSplit.getStartOffset()), existingInputSplit.getPath()); + } else if (endOffset == startOffset + 1) { + if (startOffset < existingInputSplit.getStartOffset() || startOffset >= existingInputSplit.getEndOffset()) { + newInputSplit = new KafkaPullerInputSplit(tp.topic(), tp.partition(), + // non existing offset will be seeking last offset + existingInputSplit.getEndOffset(), existingInputSplit.getEndOffset(), existingInputSplit.getPath()); + } else { + newInputSplit = + new KafkaPullerInputSplit(tp.topic(), + tp.partition(), + startOffset, + endOffset, + existingInputSplit.getPath()); + } + + } else { + newInputSplit = + new KafkaPullerInputSplit(tp.topic(), + tp.partition(), + existingInputSplit.getStartOffset(), + existingInputSplit.getEndOffset(), + existingInputSplit.getPath()); + } + + newScan.put(tp, KafkaPullerInputSplit.intersectRange(newInputSplit, existingInputSplit)); + }); 
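+ // Intersecting with the original split keeps every pruned range within the offsets the partition actually
+ // covers, so an out-of-range constant can only shrink the scan, never extend it.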
+ + return newScan; + } + + @Nullable private static Map buildScanForTimesPredicate( + Map fullHouse, + PredicateLeaf.Operator operator, + long timestamp, + boolean flip, + boolean negation, + KafkaConsumer consumer) { + long + increment = + (flip && operator == PredicateLeaf.Operator.LESS_THAN + || negation && operator == PredicateLeaf.Operator.LESS_THAN_EQUALS) ? 1L : 0L; + // only accepted cases are timestamp_column [ > ; >= ; = ]constant + if (operator == PredicateLeaf.Operator.EQUALS || flip ^ negation) { + final Map timePartitionsMap = Maps.toMap(fullHouse.keySet(), tp -> timestamp + increment); + try { + // Based on Kafka docs + // NULL will be returned for that partition If the message format version in a partition is before 0.10.0 + Map offsetAndTimestamp = consumer.offsetsForTimes(timePartitionsMap); + return Maps.toMap(fullHouse.keySet(), tp -> { + KafkaPullerInputSplit existing = fullHouse.get(tp); + OffsetAndTimestamp foundOffsetAndTime = offsetAndTimestamp.get(tp); + //Null in case filter doesn't match or field not existing ie old broker thus return empty scan. + final long startOffset = foundOffsetAndTime == null ? existing.getEndOffset() : foundOffsetAndTime.offset(); + return new KafkaPullerInputSplit(Objects.requireNonNull(tp).topic(), + tp.partition(), + startOffset, + existing.getEndOffset(), + existing.getPath()); + }); + } catch (Exception e) { + LOG.error("Error while looking up offsets for time", e); + //Bailout when can not figure out offsets for times. + return null; + } + + } + return null; + } + + /** + * @param expr And expression to be parsed + * + * @return either full scan or an optimized sub scan. + */ + private Map pushAndOp(ExprNodeGenericFuncDesc expr) { + Map currentScan = new HashMap<>(); + + fullHouse.forEach((tp, input) -> currentScan.put(tp, KafkaPullerInputSplit.copyOf(input))); + + for (ExprNodeDesc child : expr.getChildren()) { + Map scan = parseAndOptimize(child); + if (scan != null) { + Set currentKeys = ImmutableSet.copyOf(currentScan.keySet()); + currentKeys.forEach(key -> { + KafkaPullerInputSplit newSplit = scan.get(key); + KafkaPullerInputSplit oldSplit = currentScan.get(key); + currentScan.remove(key); + if (newSplit != null) { + KafkaPullerInputSplit intersectionSplit = KafkaPullerInputSplit.intersectRange(newSplit, oldSplit); + if (intersectionSplit != null) { + currentScan.put(key, intersectionSplit); + } + } + }); + + } + } + return currentScan; + } + + @Nullable private Map pushOrOp(ExprNodeGenericFuncDesc expr) { + final Map currentScan = new HashMap<>(); + for (ExprNodeDesc child : expr.getChildren()) { + Map scan = parseAndOptimize(child); + if (scan == null) { + // if any of the children is unknown bailout + return null; + } + + scan.forEach((tp, input) -> { + KafkaPullerInputSplit existingSplit = currentScan.get(tp); + currentScan.put(tp, KafkaPullerInputSplit.unionRange(input, existingSplit == null ? input : existingSplit)); + }); + } + return currentScan; + } + + @SuppressWarnings("Duplicates") private static ExprNodeDesc getColumnExpr(ExprNodeDesc expr) { + if (expr instanceof ExprNodeColumnDesc) { + return expr; + } + ExprNodeGenericFuncDesc funcDesc = null; + if (expr instanceof ExprNodeGenericFuncDesc) { + funcDesc = (ExprNodeGenericFuncDesc) expr; + } + if (null == funcDesc) { + return expr; + } + GenericUDF udf = funcDesc.getGenericUDF(); + // check if its a simple cast expression. 
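+    // For example, a pushed-down predicate such as CAST(__partition AS BIGINT) = 1 arrives wrapped in a cast UDF;
+    // unwrapping the single-child cast below lets the trimmer recognize the underlying __partition/__offset column.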
+ if ((udf instanceof GenericUDFBridge + || udf instanceof GenericUDFToBinary + || udf instanceof GenericUDFToChar + || udf instanceof GenericUDFToVarchar + || udf instanceof GenericUDFToDecimal + || udf instanceof GenericUDFToDate + || udf instanceof GenericUDFToUnixTimeStamp + || udf instanceof GenericUDFToUtcTimestamp) && funcDesc.getChildren().size() == 1 && funcDesc.getChildren() + .get(0) instanceof ExprNodeColumnDesc) { + return expr.getChildren().get(0); + } + return expr; + } + +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaStorageHandler.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaStorageHandler.java new file mode 100644 index 0000000000..5847df5e7e --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaStorageHandler.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.base.Preconditions; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.metastore.HiveMetaHook; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler; +import org.apache.hadoop.hive.ql.plan.TableDesc; +import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider; +import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputFormat; +import org.apache.hadoop.mapred.lib.NullOutputFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** + * Hive Kafka storage handler to allow user querying Stream of tuples from a Kafka queue. 
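+ * <p>
+ * A table backed by this handler is expected to set at least the {@code kafka.topic} and
+ * {@code kafka.bootstrap.servers} table properties; {@code kafka.serde.class} optionally selects the
+ * delegate SerDe (defaults to {@link KafkaJsonSerDe}), and any table property prefixed with
+ * "kafka.consumer." is forwarded to the underlying Kafka consumer.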
+ */
+public class KafkaStorageHandler implements HiveStorageHandler {
+
+  private static final Logger LOG = LoggerFactory.getLogger(KafkaStorageHandler.class);
+
+  Configuration configuration;
+
+  @Override public Class getInputFormatClass() {
+    return KafkaPullerInputFormat.class;
+  }
+
+  @Override public Class getOutputFormatClass() {
+    return NullOutputFormat.class;
+  }
+
+  @Override public Class getSerDeClass() {
+    return GenericKafkaSerDe.class;
+  }
+
+  @Override public HiveMetaHook getMetaHook() {
+    return null;
+  }
+
+  @Override public HiveAuthorizationProvider getAuthorizationProvider() throws HiveException {
+    return new DefaultHiveAuthorizationProvider();
+  }
+
+  @Override public void configureInputJobProperties(TableDesc tableDesc, Map jobProperties) {
+    jobProperties.put(KafkaStreamingUtils.HIVE_KAFKA_TOPIC,
+        Preconditions.checkNotNull(tableDesc.getProperties().getProperty(KafkaStreamingUtils.HIVE_KAFKA_TOPIC),
+            "Kafka topic is missing; set table property " + KafkaStreamingUtils.HIVE_KAFKA_TOPIC));
+    LOG.debug("Table properties: Kafka Topic {}", tableDesc.getProperties().getProperty(KafkaStreamingUtils.HIVE_KAFKA_TOPIC));
+    jobProperties.put(KafkaStreamingUtils.HIVE_KAFKA_BOOTSTRAP_SERVERS,
+        Preconditions.checkNotNull(tableDesc.getProperties().getProperty(KafkaStreamingUtils.HIVE_KAFKA_BOOTSTRAP_SERVERS),
+            "Broker address is missing; set table property " + KafkaStreamingUtils.HIVE_KAFKA_BOOTSTRAP_SERVERS));
+    LOG.debug("Table properties: Kafka broker {}", tableDesc.getProperties().getProperty(KafkaStreamingUtils.HIVE_KAFKA_BOOTSTRAP_SERVERS));
+    jobProperties.put(KafkaStreamingUtils.SERDE_CLASS_NAME,
+        tableDesc.getProperties().getProperty(KafkaStreamingUtils.SERDE_CLASS_NAME, KafkaJsonSerDe.class.getName()));
+
+    LOG.debug("Table properties: SerDe class name {}", jobProperties.get(KafkaStreamingUtils.SERDE_CLASS_NAME));
+
+    // set extra consumer properties
+    tableDesc.getProperties()
+        .entrySet()
+        .stream()
+        .filter(objectObjectEntry -> objectObjectEntry.getKey()
+            .toString()
+            .toLowerCase()
+            .startsWith(KafkaStreamingUtils.CONSUMER_CONFIGURATION_PREFIX))
+        .forEach(entry -> {
+          String key = entry.getKey().toString().substring(KafkaStreamingUtils.CONSUMER_CONFIGURATION_PREFIX.length() + 1);
+          if (KafkaStreamingUtils.FORBIDDEN_PROPERTIES.contains(key)) {
+            throw new IllegalArgumentException("Not allowed to set Kafka property " + key);
+          }
+          String value = entry.getValue().toString();
+          jobProperties.put(key, value);
+          LOG.info("Setting extra job properties: key [{}] -> value [{}]", key, value);
+
+        });
+  }
+
+  @Override public void configureInputJobCredentials(TableDesc tableDesc, Map secrets) {
+
+  }
+
+  @Override public void configureOutputJobProperties(TableDesc tableDesc, Map jobProperties) {
+
+  }
+
+  @Override public void configureTableJobProperties(TableDesc tableDesc, Map jobProperties) {
+    configureInputJobProperties(tableDesc, jobProperties);
+  }
+
+  @Override public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
+    Map properties = new HashMap<>();
+    configureInputJobProperties(tableDesc, properties);
+    properties.forEach((key, value) -> jobConf.set(key, value));
+    try {
+      KafkaStreamingUtils.copyDependencyJars(jobConf, KafkaStorageHandler.class);
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override public void setConf(Configuration configuration) {
+    this.configuration = configuration;
+  }
+
+  @Override public Configuration getConf() {
+    return configuration;
+  }
+
+  @Override public String toString() {
+    return
"org.apache.hadoop.hive.kafka.KafkaStorageHandler"; + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaStreamingUtils.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaStreamingUtils.java new file mode 100644 index 0000000000..d2d0ebc192 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/KafkaStreamingUtils.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.collect.ImmutableList; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.Utilities; +import org.apache.hadoop.hive.serde2.AbstractSerDe; +import org.apache.hadoop.util.StringUtils; +import org.apache.hive.common.util.ReflectionUtil; +import org.apache.kafka.clients.CommonClientConfigs; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Map; +import java.util.Objects; +import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Constant, Table properties, Utilities class. + */ +final class KafkaStreamingUtils { + + /** + * MANDATORY Table property indicating kafka topic backing the table + */ + static final String HIVE_KAFKA_TOPIC = "kafka.topic"; + /** + * MANDATORY Table property indicating kafka broker(s) connection string. + */ + static final String HIVE_KAFKA_BOOTSTRAP_SERVERS = "kafka.bootstrap.servers"; + /** + * Table property indicating which delegate serde to be used, NOT MANDATORY defaults to {@link KafkaJsonSerDe} + */ + static final String SERDE_CLASS_NAME = "kafka.serde.class"; + /** + * Table property indicating poll/fetch timeout period in millis. 
+   * Note that this is independent of the internal Kafka consumer timeouts; defaults to {@link #DEFAULT_CONSUMER_POLL_TIMEOUT_MS}.
+   */
+  static final String HIVE_KAFKA_POLL_TIMEOUT = "hive.kafka.poll.timeout.ms";
+  /**
+   * Default poll timeout for fetching metadata and record batches.
+   */
+  static final long DEFAULT_CONSUMER_POLL_TIMEOUT_MS = 5000L; // 5 seconds
+  /**
+   * Record timestamp column name, added as an extra metadata column of type long.
+   */
+  static final String TIMESTAMP_COLUMN = "__timestamp";
+  /**
+   * Record Kafka partition column name, added as an extra metadata column of type int.
+   */
+  static final String PARTITION_COLUMN = "__partition";
+  /**
+   * Record offset column name, added as an extra metadata column of type long.
+   */
+  static final String OFFSET_COLUMN = "__offset";
+
+  /**
+   * Start offset given by the input split; this reflects either the actual start of the topic partition
+   * or the start chosen by the split pruner.
+   */
+  static final String START_OFFSET_COLUMN = "__start_offset";
+
+  /**
+   * End offset given by the input split at run time.
+   */
+  static final String END_OFFSET_COLUMN = "__end_offset";
+  /**
+   * Table property prefix used to inject Kafka consumer properties, e.g. "kafka.consumer.max.poll.records" = "5000"
+   * injects max.poll.records=5000 into the Kafka consumer. NOT MANDATORY; defaults to nothing.
+   */
+  static final String CONSUMER_CONFIGURATION_PREFIX = "kafka.consumer";
+
+  /**
+   * Set of Kafka properties that the user cannot set via DDL.
+   */
+  static final HashSet FORBIDDEN_PROPERTIES =
+      new HashSet<>(ImmutableList.of(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,
+          ConsumerConfig.AUTO_OFFSET_RESET_CONFIG));
+
+  private KafkaStreamingUtils() {
+  }
+
+  /**
+   * @param configuration Job configs
+   *
+   * @return default consumer properties
+   */
+  static Properties consumerProperties(Configuration configuration) {
+    final Properties props = new Properties();
+    // we manage the commit offsets ourselves
+    props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
+    // we seek explicitly in the stream, so no automatic reset
+    props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "none");
+    String brokerEndPoint = configuration.get(HIVE_KAFKA_BOOTSTRAP_SERVERS);
+    if (brokerEndPoint == null || brokerEndPoint.isEmpty()) {
+      throw new IllegalArgumentException("Kafka broker endpoint is missing; please set config "
+          + HIVE_KAFKA_BOOTSTRAP_SERVERS);
+    }
+    props.setProperty(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, brokerEndPoint);
+    props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
+    props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
+    // the user can override any other consumer property
+    final Map kafkaProperties = configuration.getValByRegex("^" + CONSUMER_CONFIGURATION_PREFIX + "\\..*");
+    for (Map.Entry entry : kafkaProperties.entrySet()) {
+      String key = entry.getKey().substring(CONSUMER_CONFIGURATION_PREFIX.length() + 1);
+      if (FORBIDDEN_PROPERTIES.contains(key)) {
+        throw new IllegalArgumentException("Not allowed to set Kafka property " + key);
+      }
+      props.setProperty(key, entry.getValue());
+    }
+    return props;
+  }
+
+  static void copyDependencyJars(Configuration conf, Class...
classes) throws IOException { + Set jars = new HashSet<>(); + FileSystem localFs = FileSystem.getLocal(conf); + jars.addAll(conf.getStringCollection("tmpjars")); + jars.addAll(Arrays.stream(classes).filter(Objects::nonNull).map(clazz -> { + String path = Utilities.jarFinderGetJar(clazz); + if (path == null) { + throw new RuntimeException("Could not find jar for class " + clazz + " in order to ship it to the cluster."); + } + try { + if (!localFs.exists(new Path(path))) { + throw new RuntimeException("Could not validate jar file " + path + " for class " + clazz); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return path; + }).collect(Collectors.toList())); + + if (jars.isEmpty()) { + return; + } + conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[0]))); + } + + static AbstractSerDe createDelegate(String className) { + final Class clazz; + try { + //noinspection unchecked + clazz = (Class) Class.forName(className); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + // we are not setting conf thus null is okay + return ReflectionUtil.newInstance(clazz, null); + } +} diff --git kafka-handler/src/java/org/apache/hadoop/hive/kafka/package-info.java kafka-handler/src/java/org/apache/hadoop/hive/kafka/package-info.java new file mode 100644 index 0000000000..8a0d8fd0b0 --- /dev/null +++ kafka-handler/src/java/org/apache/hadoop/hive/kafka/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Package info file. + */ + +package org.apache.hadoop.hive.kafka; diff --git kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaPullerInputSplitTest.java kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaPullerInputSplitTest.java new file mode 100644 index 0000000000..be26986818 --- /dev/null +++ kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaPullerInputSplitTest.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.kafka; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.DataInputBuffer; +import org.apache.hadoop.io.DataOutputBuffer; +import org.junit.Assert; +import org.junit.Test; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; + +/** + * Kafka Hadoop InputSplit Test. + */ +public class KafkaPullerInputSplitTest { + private String topic = "my_topic"; + private KafkaPullerInputSplit expectedInputSplit; + + public KafkaPullerInputSplitTest() { + this.expectedInputSplit = new KafkaPullerInputSplit(this.topic, 1, 50L, 56L, new Path("/tmp")); + } + + @Test public void testWriteRead() throws IOException { + DataOutput output = new DataOutputBuffer(); + this.expectedInputSplit.write(output); + KafkaPullerInputSplit kafkaPullerInputSplit = new KafkaPullerInputSplit(); + DataInput input = new DataInputBuffer(); + ((DataInputBuffer) input).reset(((DataOutputBuffer) output).getData(), 0, ((DataOutputBuffer) output).getLength()); + kafkaPullerInputSplit.readFields(input); + Assert.assertEquals(this.expectedInputSplit, kafkaPullerInputSplit); + } + + @Test public void andRangeOverLapping() { + KafkaPullerInputSplit kafkaPullerInputSplit = new KafkaPullerInputSplit("test-topic", 2, 10, 400, new Path("/tmp")); + + KafkaPullerInputSplit kafkaPullerInputSplit2 = new KafkaPullerInputSplit("test-topic", 2, 3, 200, new Path("/tmp")); + + Assert.assertEquals(new KafkaPullerInputSplit("test-topic", 2, 10, 200, new Path("/tmp")), + KafkaPullerInputSplit.intersectRange(kafkaPullerInputSplit, kafkaPullerInputSplit2)); + + } + + @Test public void andRangeNonOverLapping() { + KafkaPullerInputSplit kafkaPullerInputSplit = new KafkaPullerInputSplit("test-topic", 2, 10, 400, new Path("/tmp")); + + KafkaPullerInputSplit + kafkaPullerInputSplit2 = + new KafkaPullerInputSplit("test-topic", 2, 550, 700, new Path("/tmp")); + + Assert.assertEquals(null, KafkaPullerInputSplit.intersectRange(kafkaPullerInputSplit, kafkaPullerInputSplit2)); + + } + + @Test public void orRange() { + KafkaPullerInputSplit + kafkaPullerInputSplit = + new KafkaPullerInputSplit("test-topic", 2, 300, 400, new Path("/tmp")); + + KafkaPullerInputSplit kafkaPullerInputSplit2 = new KafkaPullerInputSplit("test-topic", 2, 3, 600, new Path("/tmp")); + + Assert.assertEquals(kafkaPullerInputSplit2, + KafkaPullerInputSplit.unionRange(kafkaPullerInputSplit, kafkaPullerInputSplit2)); + + KafkaPullerInputSplit + kafkaPullerInputSplit3 = + new KafkaPullerInputSplit("test-topic", 2, 700, 6000, new Path("/tmp")); + + Assert.assertEquals(new KafkaPullerInputSplit("test-topic", 2, 300, 6000, new Path("/tmp")), + KafkaPullerInputSplit.unionRange(kafkaPullerInputSplit, kafkaPullerInputSplit3)); + } + + @Test public void copyOf() { + KafkaPullerInputSplit + kafkaPullerInputSplit = + new KafkaPullerInputSplit("test-topic", 2, 300, 400, new Path("/tmp")); + + KafkaPullerInputSplit copyOf = KafkaPullerInputSplit.copyOf(kafkaPullerInputSplit); + Assert.assertEquals(kafkaPullerInputSplit, copyOf); + Assert.assertTrue(kafkaPullerInputSplit != copyOf); + } + + @Test public void testClone() { + KafkaPullerInputSplit + kafkaPullerInputSplit = + new KafkaPullerInputSplit("test-topic", 2, 300, 400, new Path("/tmp")); + + KafkaPullerInputSplit clone = kafkaPullerInputSplit.clone(); + Assert.assertEquals(kafkaPullerInputSplit, clone); + Assert.assertTrue(clone != kafkaPullerInputSplit); + + } + + @Test public void testSlice() { + KafkaPullerInputSplit + kafkaPullerInputSplit = 
+ new KafkaPullerInputSplit("test-topic", 2, 300, 400, new Path("/tmp")); + List kafkaPullerInputSplitList = KafkaPullerInputSplit.slice(14, kafkaPullerInputSplit); + Assert.assertEquals(kafkaPullerInputSplitList.stream() + .mapToLong(kafkaPullerInputSplit1 -> kafkaPullerInputSplit1.getEndOffset() + - kafkaPullerInputSplit1.getStartOffset()) + .sum(), kafkaPullerInputSplit.getEndOffset() - kafkaPullerInputSplit.getStartOffset()); + Assert.assertTrue(kafkaPullerInputSplitList.stream() + .filter(kafkaPullerInputSplit1 -> kafkaPullerInputSplit.getStartOffset() + == kafkaPullerInputSplit1.getStartOffset()) + .count() == 1); + Assert.assertTrue(kafkaPullerInputSplitList.stream() + .filter(kafkaPullerInputSplit1 -> kafkaPullerInputSplit.getEndOffset() == kafkaPullerInputSplit1.getEndOffset()) + .count() == 1); + + } +} diff --git kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaRecordIteratorTest.java kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaRecordIteratorTest.java new file mode 100644 index 0000000000..5de51cd00a --- /dev/null +++ kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaRecordIteratorTest.java @@ -0,0 +1,312 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.collect.ImmutableList; +import kafka.admin.AdminUtils; +import kafka.admin.RackAwareMode; +import kafka.server.KafkaConfig; +import kafka.server.KafkaServer; +import kafka.utils.MockTime; +import kafka.utils.TestUtils; +import kafka.utils.ZKStringSerializer$; +import kafka.utils.ZkUtils; +import kafka.zk.EmbeddedZookeeper; +import org.I0Itec.zkclient.ZkClient; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.errors.TimeoutException; +import org.apache.kafka.common.serialization.ByteArrayDeserializer; +import org.apache.kafka.common.utils.Time; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.util.Iterator; +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Kafka Iterator Tests. + */ +public class KafkaRecordIteratorTest { + private static final Logger LOG = LoggerFactory.getLogger(KafkaRecordIteratorTest.class); + private static final int RECORD_NUMBER = 100; + private static final String TOPIC = "my_test_topic"; + private static final TopicPartition TOPIC_PARTITION = new TopicPartition(TOPIC, 0); + public static final byte[] KEY_BYTES = "KEY".getBytes(Charset.forName("UTF-8")); + private static final List> + RECORDS = + IntStream.range(0, RECORD_NUMBER).mapToObj(number -> { + final byte[] value = ("VALUE-" + Integer.toString(number)).getBytes(Charset.forName("UTF-8")); + return new ConsumerRecord<>(TOPIC, 0, (long) number, 0L, null, 0L, 0, 0, KEY_BYTES, value); + }).collect(Collectors.toList()); + public static final long POLL_TIMEOUT_MS = 900L; + private static ZkUtils zkUtils; + private static ZkClient zkClient; + private static KafkaProducer producer; + private static KafkaServer kafkaServer; + private static String zkConnect; + private KafkaConsumer consumer = null; + private KafkaRecordIterator kafkaRecordIterator = null; + private Configuration conf = new Configuration(); + private static EmbeddedZookeeper zkServer; + + public KafkaRecordIteratorTest() { + } + + @BeforeClass public static void setupCluster() throws IOException { + LOG.info("init embedded Zookeeper"); + zkServer = new EmbeddedZookeeper(); + zkConnect = "127.0.0.1:" + zkServer.port(); + zkClient = new ZkClient(zkConnect, 3000, 3000, ZKStringSerializer$.MODULE$); + zkUtils = ZkUtils.apply(zkClient, false); + LOG.info("init kafka broker"); + Properties brokerProps = new Properties(); + brokerProps.setProperty("zookeeper.connect", zkConnect); + brokerProps.setProperty("broker.id", "0"); + brokerProps.setProperty("log.dir", Files.createTempDirectory("kafka-log-dir-").toAbsolutePath().toString()); + brokerProps.setProperty("listeners", 
"PLAINTEXT://127.0.0.1:9092"); + brokerProps.setProperty("offsets.TOPIC.replication.factor", "1"); + KafkaConfig config = new KafkaConfig(brokerProps); + Time mock = new MockTime(); + kafkaServer = TestUtils.createServer(config, mock); + kafkaServer.startup(); + LOG.info("Creating kafka TOPIC [{}]", TOPIC); + AdminUtils.createTopic(zkUtils, TOPIC, 1, 1, new Properties(), RackAwareMode.Disabled$.MODULE$); + setupProducer(); + sendData(); + } + + @Before public void setUp() { + LOG.info("setting up consumer"); + this.setupConsumer(); + this.kafkaRecordIterator = null; + } + + @Test public void testHasNextAbsoluteStartEnd() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 0L, (long) RECORDS.size(), POLL_TIMEOUT_MS); + this.compareIterator(RECORDS, this.kafkaRecordIterator); + } + + @Test public void testHasNextGivenStartEnd() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 2L, 4L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS.stream() + .filter((consumerRecord) -> consumerRecord.offset() >= 2L && consumerRecord.offset() < 4L) + .collect(Collectors.toList()), this.kafkaRecordIterator); + } + + @Test public void testHasNextNoOffsets() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS, this.kafkaRecordIterator); + } + + @Test public void testHasNextLastRecord() { + long startOffset = (long) (RECORDS.size() - 1); + long lastOffset = (long) RECORDS.size(); + this.kafkaRecordIterator = + new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, startOffset, lastOffset, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS.stream() + .filter((consumerRecord) -> consumerRecord.offset() >= startOffset && consumerRecord.offset() < lastOffset) + .collect(Collectors.toList()), this.kafkaRecordIterator); + } + + @Test public void testHasNextFirstRecord() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 0L, 1L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS.stream() + .filter((consumerRecord) -> consumerRecord.offset() >= 0L && consumerRecord.offset() < 1L) + .collect(Collectors.toList()), this.kafkaRecordIterator); + } + + @Test public void testHasNextNoStart() { + this.kafkaRecordIterator = + new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, null, 10L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS.stream() + .filter((consumerRecord) -> consumerRecord.offset() >= 0L && consumerRecord.offset() < 10L) + .collect(Collectors.toList()), this.kafkaRecordIterator); + } + + @Test public void testHasNextNoEnd() { + long lastOffset = (long) RECORDS.size(); + this.kafkaRecordIterator = + new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 5L, null, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS.stream() + .filter((consumerRecord) -> consumerRecord.offset() >= 5L && consumerRecord.offset() < lastOffset) + .collect(Collectors.toList()), this.kafkaRecordIterator); + } + + @Test public void testRecordReader() throws IOException { + List + serRecords = + RECORDS.stream() + .map((aRecord) -> new KafkaRecordWritable(aRecord.partition(), + aRecord.offset(), + aRecord.timestamp(), + aRecord.value(), + 50L, + 100L)) + .collect(Collectors.toList()); + KafkaPullerRecordReader recordReader = new KafkaPullerRecordReader(); + TaskAttemptContext context = new TaskAttemptContextImpl(this.conf, new TaskAttemptID()); + recordReader.initialize(new KafkaPullerInputSplit(TOPIC, 0, 50L, 100L, null), context); + + for (int i = 
50; i < 100; ++i) { + KafkaRecordWritable record = new KafkaRecordWritable(); + Assert.assertTrue(recordReader.next(null, record)); + Assert.assertEquals(serRecords.get(i), record); + } + + recordReader.close(); + } + + @Test(expected = TimeoutException.class) public void testPullingBeyondLimit() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 0L, 101L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS, this.kafkaRecordIterator); + } + + @Test(expected = IllegalStateException.class) public void testPullingStartGreaterThanEnd() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 10L, 1L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS, this.kafkaRecordIterator); + } + + @Test(expected = TimeoutException.class) public void testPullingFromEmptyTopic() { + this.kafkaRecordIterator = + new KafkaRecordIterator(this.consumer, new TopicPartition("noHere", 0), 0L, 100L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS, this.kafkaRecordIterator); + } + + @Test(expected = TimeoutException.class) public void testPullingFromEmptyPartition() { + this.kafkaRecordIterator = + new KafkaRecordIterator(this.consumer, new TopicPartition(TOPIC, 1), 0L, 100L, POLL_TIMEOUT_MS); + this.compareIterator(RECORDS, this.kafkaRecordIterator); + } + + @Test public void testStartIsEqualEnd() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 10L, 10L, POLL_TIMEOUT_MS); + this.compareIterator(ImmutableList.of(), this.kafkaRecordIterator); + } + + @Test public void testStartIsTheLastOffset() { + this.kafkaRecordIterator = + new KafkaRecordIterator(this.consumer, + TOPIC_PARTITION, + new Long(RECORD_NUMBER), + new Long(RECORD_NUMBER), + POLL_TIMEOUT_MS); + this.compareIterator(ImmutableList.of(), this.kafkaRecordIterator); + } + + @Test public void testStartIsTheFirstOffset() { + this.kafkaRecordIterator = new KafkaRecordIterator(this.consumer, TOPIC_PARTITION, 0L, 0L, POLL_TIMEOUT_MS); + this.compareIterator(ImmutableList.of(), this.kafkaRecordIterator); + } + + private void compareIterator(List> expected, + Iterator> kafkaRecordIterator) { + expected.stream().forEachOrdered((expectedRecord) -> { + Assert.assertTrue("record with offset " + expectedRecord.offset(), kafkaRecordIterator.hasNext()); + ConsumerRecord record = kafkaRecordIterator.next(); + Assert.assertTrue(record.topic().equals(TOPIC)); + Assert.assertTrue(record.partition() == 0); + Assert.assertEquals("Offsets not matching", expectedRecord.offset(), record.offset()); + byte[] binaryExceptedValue = expectedRecord.value(); + byte[] binaryExceptedKey = expectedRecord.key(); + byte[] binaryValue = (byte[]) record.value(); + byte[] binaryKey = (byte[]) record.key(); + Assert.assertArrayEquals(binaryExceptedValue, binaryValue); + Assert.assertArrayEquals(binaryExceptedKey, binaryKey); + }); + Assert.assertFalse(kafkaRecordIterator.hasNext()); + } + + private static void setupProducer() { + LOG.info("Setting up kafka producer"); + Properties producerProps = new Properties(); + producerProps.setProperty("bootstrap.servers", "127.0.0.1:9092"); + producerProps.setProperty("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); + producerProps.setProperty("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); + producerProps.setProperty("max.block.ms", "10000"); + producer = new KafkaProducer(producerProps); + LOG.info("kafka producer started"); + } + + private void setupConsumer() { + Properties consumerProps = 
new Properties(); + consumerProps.setProperty("enable.auto.commit", "false"); + consumerProps.setProperty("auto.offset.reset", "none"); + consumerProps.setProperty("bootstrap.servers", "127.0.0.1:9092"); + this.conf.set("kafka.bootstrap.servers", "127.0.0.1:9092"); + consumerProps.setProperty("key.deserializer", ByteArrayDeserializer.class.getName()); + consumerProps.setProperty("value.deserializer", ByteArrayDeserializer.class.getName()); + consumerProps.setProperty("request.timeout.ms", "3002"); + consumerProps.setProperty("fetch.max.wait.ms", "3001"); + consumerProps.setProperty("session.timeout.ms", "3001"); + consumerProps.setProperty("metadata.max.age.ms", "100"); + this.consumer = new KafkaConsumer(consumerProps); + } + + private static void sendData() { + LOG.info("Sending {} records", RECORD_NUMBER); + RECORDS.stream() + .map(consumerRecord -> new ProducerRecord(consumerRecord.topic(), + consumerRecord.partition(), + consumerRecord.timestamp(), + consumerRecord.key(), + consumerRecord.value())) + .forEach(producerRecord -> producer.send(producerRecord)); + producer.close(); + } + + @After public void tearDown() { + this.kafkaRecordIterator = null; + if (this.consumer != null) { + this.consumer.close(); + } + } + + @AfterClass public static void tearDownCluster() { + if (kafkaServer != null) { + kafkaServer.shutdown(); + kafkaServer.zkUtils().close(); + kafkaServer.awaitShutdown(); + } + zkServer.shutdown(); + zkClient.close(); + zkUtils.close(); + } +} diff --git kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaRecordWritableTest.java kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaRecordWritableTest.java new file mode 100644 index 0000000000..8f9df548f7 --- /dev/null +++ kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaRecordWritableTest.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.junit.Assert; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +/** + * Test class for kafka Writable. 
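+ * Verifies that a KafkaRecordWritable round-trips through write()/readFields() unchanged.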
+ */ +public class KafkaRecordWritableTest { + public KafkaRecordWritableTest() { + } + + @Test public void testWriteReadFields() throws IOException { + ConsumerRecord record = new ConsumerRecord("topic", 0, 3L, "key".getBytes(), "value".getBytes()); + KafkaRecordWritable kafkaRecordWritable = new KafkaRecordWritable(record.partition(), record.offset(), record.timestamp(), record.value(), 0L, 100L); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DataOutputStream w = new DataOutputStream(baos); + kafkaRecordWritable.write(w); + w.flush(); + + ByteArrayInputStream input = new ByteArrayInputStream(baos.toByteArray()); + DataInputStream inputStream = new DataInputStream(input); + KafkaRecordWritable actualKafkaRecordWritable = new KafkaRecordWritable(); + actualKafkaRecordWritable.readFields(inputStream); + Assert.assertEquals(kafkaRecordWritable, actualKafkaRecordWritable); + } +} diff --git kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaScanTrimmerTest.java kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaScanTrimmerTest.java new file mode 100644 index 0000000000..289dafde36 --- /dev/null +++ kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaScanTrimmerTest.java @@ -0,0 +1,569 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.exec.SerializationUtilities; +import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.kafka.common.TopicPartition; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import static org.junit.Assert.assertNotNull; + +/** + * Test Class for Kafka Trimmer Class. 
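+ * Exercises partition and offset pruning against a fixed scan over four partitions of "my_topic"
+ * (see the fullHouse map below).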
+ */ +public class KafkaScanTrimmerTest { + private static final Path PATH = new Path("/tmp"); + + private ExprNodeDesc zeroInt = ConstantExprBuilder.build(0); + private ExprNodeDesc threeInt = ConstantExprBuilder.build(3); + private ExprNodeDesc thirtyLong = ConstantExprBuilder.build(30L); + private ExprNodeDesc thirtyFiveLong = ConstantExprBuilder.build(35L); + private ExprNodeDesc seventyFiveLong = ConstantExprBuilder.build(75L); + private ExprNodeDesc fortyLong = ConstantExprBuilder.build(40L); + + private ExprNodeDesc + partitionColumn = + new ExprNodeColumnDesc(TypeInfoFactory.intTypeInfo, KafkaStreamingUtils.PARTITION_COLUMN, null, false); + private ExprNodeDesc + offsetColumn = + new ExprNodeColumnDesc(TypeInfoFactory.longTypeInfo, KafkaStreamingUtils.OFFSET_COLUMN, null, false); + + private String topic = "my_topic"; + private Map + fullHouse = + ImmutableMap.of(new TopicPartition(topic, 0), + new KafkaPullerInputSplit(topic, 0, 0, 45, PATH), + new TopicPartition(topic, 1), + new KafkaPullerInputSplit(topic, 1, 5, 1005, PATH), + new TopicPartition(topic, 2), + new KafkaPullerInputSplit(topic, 2, 9, 100, PATH), + new TopicPartition(topic, 3), + new KafkaPullerInputSplit(topic, 3, 0, 100, PATH)); + + @Test public void computeOptimizedScanPartitionBinaryOpFilter() { + KafkaScanTrimmer kafkaScanTrimmer = new KafkaScanTrimmer(fullHouse, null); + int partitionId = 2; + ExprNodeDesc constant = ConstantExprBuilder.build(partitionId); + final List children = Lists.newArrayList(partitionColumn, constant); + + ExprNodeGenericFuncDesc node = eq(children); + assertNotNull(node); + + Map + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(node))); + Map expected = Maps.filterValues(fullHouse, tp -> Objects.requireNonNull(tp).getPartition() == partitionId); + Assert.assertEquals(expected, actual); + + ExprNodeGenericFuncDesc lessNode = lessThan(children); + assertNotNull(lessNode); + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(lessNode))); + expected = Maps.filterValues(fullHouse, tp -> Objects.requireNonNull(tp).getPartition() < partitionId); + Assert.assertEquals(expected, actual); + + ExprNodeGenericFuncDesc lessEqNode = lessThanEq(children); + + assertNotNull(lessEqNode); + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(lessEqNode))); + expected = Maps.filterValues(fullHouse, tp -> Objects.requireNonNull(tp).getPartition() <= partitionId); + Assert.assertEquals(expected, actual); + + } + + @Test public void computeOptimizedScanFalseFilter() { + KafkaScanTrimmer kafkaScanTrimmer = new KafkaScanTrimmer(fullHouse, null); + ExprNodeGenericFuncDesc + falseFilter = + and(Lists.newArrayList(eq(Lists.newArrayList(partitionColumn, zeroInt)), + eq(Lists.newArrayList(partitionColumn, threeInt)))); + + assertNotNull(falseFilter); + Map + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(falseFilter))); + Assert.assertTrue(actual.isEmpty()); + + ExprNodeGenericFuncDesc + falseFilter2 = + and(Lists.newArrayList(eq(Lists.newArrayList(offsetColumn, thirtyFiveLong)), + eq(Lists.newArrayList(offsetColumn, fortyLong)))); + + assertNotNull(falseFilter2); + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + 
.deserializeExpression(SerializationUtilities.serializeExpression(falseFilter2))); + Assert.assertTrue(actual.isEmpty()); + + ExprNodeGenericFuncDesc filter3 = or(Lists.newArrayList(falseFilter, falseFilter2)); + + assertNotNull(filter3); + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(filter3))); + Assert.assertTrue(actual.isEmpty()); + + ExprNodeGenericFuncDesc + filter4 = + and(Lists.newArrayList(filter3, eq(Lists.newArrayList(partitionColumn, zeroInt)))); + assertNotNull(filter4); + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(filter4))); + Assert.assertTrue(actual.isEmpty()); + } + + @Test public void computeOptimizedScanOrAndCombinedFilter() { + KafkaScanTrimmer kafkaScanTrimmer = new KafkaScanTrimmer(fullHouse, null); + // partition = 0 and 30 <= offset < 35 or partition = 3 and 35 <= offset < 75 or (partition = 0 and offset = 40) + + ExprNodeGenericFuncDesc + part1 = + and(Lists.newArrayList(greaterThanEq(Lists.newArrayList(offsetColumn, thirtyLong)), + eq(Lists.newArrayList(partitionColumn, zeroInt)), + lessThan(Lists.newArrayList(offsetColumn, thirtyFiveLong)))); + + ExprNodeGenericFuncDesc + part2 = + and(Lists.newArrayList(greaterThanEq(Lists.newArrayList(offsetColumn, thirtyFiveLong)), + eq(Lists.newArrayList(partitionColumn, threeInt)), + lessThan(Lists.newArrayList(offsetColumn, seventyFiveLong)))); + + ExprNodeGenericFuncDesc + part3 = + and(Lists.newArrayList(eq(Lists.newArrayList(offsetColumn, fortyLong)), + eq(Lists.newArrayList(partitionColumn, zeroInt)))); + + ExprNodeGenericFuncDesc orExpression = or(Lists.newArrayList(part1, part2, part3)); + + assertNotNull(orExpression); + Map + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(orExpression))); + TopicPartition tpZero = new TopicPartition(topic, 0); + TopicPartition toThree = new TopicPartition(topic, 3); + KafkaPullerInputSplit split1 = new KafkaPullerInputSplit(topic, 0, 30, 41, PATH); + KafkaPullerInputSplit split2 = new KafkaPullerInputSplit(topic, 3, 35, 75, PATH); + + Map expected = ImmutableMap.of(tpZero, split1, toThree, split2); + Assert.assertEquals(expected, actual); + + } + + @Test public void computeOptimizedScanPartitionOrAndCombinedFilter() { + KafkaScanTrimmer kafkaScanTrimmer = new KafkaScanTrimmer(fullHouse, null); + + // partition = 1 or (partition >2 and <= 3) + ExprNodeGenericFuncDesc eq = eq(Lists.newArrayList(partitionColumn, ConstantExprBuilder.build(1))); + ExprNodeGenericFuncDesc lessEq = lessThanEq(Lists.newArrayList(partitionColumn, ConstantExprBuilder.build(3))); + ExprNodeGenericFuncDesc greater = greaterThan(Lists.newArrayList(partitionColumn, ConstantExprBuilder.build(2))); + ExprNodeGenericFuncDesc orNode = or(Lists.newArrayList(and(Lists.newArrayList(lessEq, greater)), eq)); + + Map + actual = + kafkaScanTrimmer.computeOptimizedScan(SerializationUtilities + .deserializeExpression(SerializationUtilities.serializeExpression(orNode))); + Map + expected = + Maps.filterValues(fullHouse, tp -> Objects.requireNonNull(tp).getPartition() == 1 || tp.getPartition() == 3); + Assert.assertEquals(expected, actual); + assertNotNull(orNode); + } + + @Test public void buildScanFormPartitionPredicateEq() { + Map + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, PredicateLeaf.Operator.EQUALS, 3, false, 
false); + TopicPartition topicPartition = new TopicPartition(topic, 3); + Assert.assertEquals(fullHouse.get(topicPartition), actual.get(topicPartition)); + } + + @Test public void buildScanFormPartitionPredicateLess() { + // partitionConst < partitionColumn (flip true) + int partitionConst = 2; + Map + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + partitionConst, + true, + false); + + Map + expected = + Maps.filterEntries(fullHouse, entry -> Objects.requireNonNull(entry).getKey().partition() > partitionConst); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + + // partitionConst >= partitionColumn (flip true, negation true) + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + partitionConst, + true, + true); + + expected = + Maps.filterEntries(fullHouse, entry -> partitionConst >= Objects.requireNonNull(entry).getKey().partition()); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + + // partitionColumn >= partitionConst (negation true) + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + partitionConst, + false, + true); + + expected = + Maps.filterEntries(fullHouse, entry -> Objects.requireNonNull(entry).getKey().partition() >= partitionConst); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + + // partitionColumn < partitionConst (negation true) + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + partitionConst, + false, + false); + + expected = + Maps.filterEntries(fullHouse, entry -> Objects.requireNonNull(entry).getKey().partition() < partitionConst); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + } + + @Test public void buildScanFormPartitionPredicateLessEq() { + // partitionConst <= partitionColumn (flip true) + int partitionConst = 2; + Map + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + partitionConst, + true, + false); + + Map + expected = + Maps.filterEntries(fullHouse, entry -> Objects.requireNonNull(entry).getKey().partition() >= partitionConst); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + + // partitionConst > partitionColumn (flip true, negation true) + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + partitionConst, + true, + true); + + expected = + Maps.filterEntries(fullHouse, entry -> partitionConst > Objects.requireNonNull(entry).getKey().partition()); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + + // partitionColumn > partitionConst (negation true) + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + partitionConst, + false, + true); + + expected = + Maps.filterEntries(fullHouse, entry -> Objects.requireNonNull(entry).getKey().partition() > partitionConst); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + + // partitionColumn <= partitionConst (negation true) + actual = + KafkaScanTrimmer.buildScanFromPartitionPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + partitionConst, + false, + false); + + expected = + Maps.filterEntries(fullHouse, entry -> 
Objects.requireNonNull(entry).getKey().partition() <= partitionConst); + Assert.assertEquals(expected, actual); + Assert.assertFalse(actual.isEmpty()); + } + + @Test public void buildScanFromOffsetPredicateEq() { + long constantOffset = 30; + Map + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.EQUALS, + constantOffset, + false, + false); + Map + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + constantOffset, + constantOffset + 1, + entry.getPath())); + + Assert.assertEquals(expected, actual); + + // seek to end if offset is out of reach + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, PredicateLeaf.Operator.EQUALS, 3000000L, false, false); + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + entry.getEndOffset(), + entry.getEndOffset(), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // seek to end if offset is out of reach + actual = KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, PredicateLeaf.Operator.EQUALS, 0L, false, false); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + entry.getStartOffset() > 0 ? entry.getEndOffset() : 0, + entry.getStartOffset() > 0 ? entry.getEndOffset() : 1, + entry.getPath())); + Assert.assertEquals(expected, actual); + + } + + @Test public void buildScanFromOffsetPredicateLess() { + long constantOffset = 50; + // columnOffset < constant + Map + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + constantOffset, + false, + false); + + Map + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + entry.getStartOffset(), + Math.min(constantOffset, entry.getEndOffset()), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // columnOffset > constant + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + constantOffset, + true, + false); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + Math.min(entry.getEndOffset(), Math.max(entry.getStartOffset(), constantOffset + 1)), + entry.getEndOffset(), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // columnOffset >= constant + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + constantOffset, + false, + true); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + Math.min(entry.getEndOffset(), Math.max(entry.getStartOffset(), constantOffset)), + entry.getEndOffset(), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // columnOffset <= constant + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN, + constantOffset, + true, + true); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + entry.getStartOffset(), + Math.min(constantOffset + 1, entry.getEndOffset()), + 
entry.getPath())); + Assert.assertEquals(expected, actual); + + } + + @Test public void buildScanFromOffsetPredicateLessEq() { + long constantOffset = 50; + // columnOffset < constant + Map + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + constantOffset, + false, + false); + + Map + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + entry.getStartOffset(), + Math.min(constantOffset + 1, entry.getEndOffset()), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // columnOffset >= constant + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + constantOffset, + true, + false); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + Math.min(entry.getEndOffset(), Math.max(entry.getStartOffset(), constantOffset)), + entry.getEndOffset(), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // columnOffset > constant + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + constantOffset, + false, + true); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + Math.min(entry.getEndOffset(), Math.max(entry.getStartOffset(), constantOffset + 1)), + entry.getEndOffset(), + entry.getPath())); + Assert.assertEquals(expected, actual); + + // columnOffset < constant + actual = + KafkaScanTrimmer.buildScanFromOffsetPredicate(fullHouse, + PredicateLeaf.Operator.LESS_THAN_EQUALS, + constantOffset, + true, + true); + + expected = + Maps.transformValues(fullHouse, + entry -> new KafkaPullerInputSplit(Objects.requireNonNull(entry).getTopic(), + entry.getPartition(), + entry.getStartOffset(), + Math.min(constantOffset, entry.getEndOffset()), + entry.getPath())); + Assert.assertEquals(expected, actual); + } + + private static class ConstantExprBuilder { + static ExprNodeDesc build(long constant) { + return new ExprNodeConstantDesc(TypeInfoFactory.longTypeInfo, constant); + } + + static ExprNodeDesc build(int constant) { + return new ExprNodeConstantDesc(TypeInfoFactory.longTypeInfo, constant); + } + } + + private static ExprNodeGenericFuncDesc or(List children) { + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, new GenericUDFOPOr(), children); + } + + private static ExprNodeGenericFuncDesc and(List children) { + return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, new GenericUDFOPAnd(), children); + } + + private static ExprNodeGenericFuncDesc eq(List children) { + return new ExprNodeGenericFuncDesc(children.get(0).getTypeInfo(), new GenericUDFOPEqual(), children); + } + + private static ExprNodeGenericFuncDesc lessThan(List children) { + return new ExprNodeGenericFuncDesc(children.get(0).getTypeInfo(), new GenericUDFOPLessThan(), children); + } + + private static ExprNodeGenericFuncDesc lessThanEq(List children) { + return new ExprNodeGenericFuncDesc(children.get(0).getTypeInfo(), new GenericUDFOPEqualOrLessThan(), children); + } + + private static ExprNodeGenericFuncDesc greaterThan(List children) { + return new ExprNodeGenericFuncDesc(children.get(0).getTypeInfo(), new GenericUDFOPGreaterThan(), children); + } + + private static ExprNodeGenericFuncDesc greaterThanEq(List 
children) { + return new ExprNodeGenericFuncDesc(children.get(0).getTypeInfo(), new GenericUDFOPEqualOrGreaterThan(), children); + } +} diff --git kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaStreamingUtilsTest.java kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaStreamingUtilsTest.java new file mode 100644 index 0000000000..8d68ec27c8 --- /dev/null +++ kafka-handler/src/test/org/apache/hadoop/hive/kafka/KafkaStreamingUtilsTest.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.kafka; + +import org.apache.hadoop.conf.Configuration; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.junit.Assert; +import org.junit.Test; + +import java.util.Properties; + +/** + * Test for Utility class. + */ +public class KafkaStreamingUtilsTest { + public KafkaStreamingUtilsTest() { + } + + @Test public void testConsumerProperties() { + Configuration configuration = new Configuration(); + configuration.set("kafka.bootstrap.servers", "localhost:9090"); + configuration.set("kafka.consumer.fetch.max.wait.ms", "40"); + configuration.set("kafka.consumer.my.new.wait.ms", "400"); + Properties properties = KafkaStreamingUtils.consumerProperties(configuration); + Assert.assertEquals("localhost:9090", properties.getProperty("bootstrap.servers")); + Assert.assertEquals("40", properties.getProperty("fetch.max.wait.ms")); + Assert.assertEquals("400", properties.getProperty("my.new.wait.ms")); + } + + @Test(expected = IllegalArgumentException.class) public void canNotSetForbiddenProp() { + Configuration configuration = new Configuration(); + configuration.set("kafka.bootstrap.servers", "localhost:9090"); + configuration.set("kafka.consumer." + ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); + KafkaStreamingUtils.consumerProperties(configuration); + } + + @Test(expected = IllegalArgumentException.class) public void canNotSetForbiddenProp2() { + Configuration configuration = new Configuration(); + configuration.set("kafka.bootstrap.servers", "localhost:9090"); + configuration.set("kafka.consumer." + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "value"); + KafkaStreamingUtils.consumerProperties(configuration); + } +} diff --git kafka-handler/src/test/org/apache/hadoop/hive/kafka/package-info.java kafka-handler/src/test/org/apache/hadoop/hive/kafka/package-info.java new file mode 100644 index 0000000000..8a0d8fd0b0 --- /dev/null +++ kafka-handler/src/test/org/apache/hadoop/hive/kafka/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
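The KafkaStreamingUtilsTest above pins down the contract of KafkaStreamingUtils.consumerProperties: every Hive configuration key prefixed with "kafka.consumer." is forwarded to the Kafka consumer with the prefix stripped, "kafka.bootstrap.servers" maps to "bootstrap.servers", and overriding enable.auto.commit or auto.offset.reset is rejected, presumably because the handler wants to stay in charge of where the consumer seeks. The following is a minimal sketch of that contract for readers of the patch; the class name and structure are assumptions, not the code this change adds.

import org.apache.hadoop.conf.Configuration;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

/** Sketch of the behavior asserted by KafkaStreamingUtilsTest; illustration only. */
public final class ConsumerPropertiesSketch {
  private static final String CONSUMER_PREFIX = "kafka.consumer.";
  // Settings the test expects to be rejected when a user tries to override them.
  private static final Set<String> FORBIDDEN =
      new HashSet<>(Arrays.asList("enable.auto.commit", "auto.offset.reset"));

  static Properties consumerProperties(Configuration configuration) {
    Properties properties = new Properties();
    // "kafka.bootstrap.servers" -> "bootstrap.servers"
    properties.setProperty("bootstrap.servers", configuration.get("kafka.bootstrap.servers"));
    for (Map.Entry<String, String> entry : configuration) {
      if (entry.getKey().startsWith(CONSUMER_PREFIX)) {
        String consumerKey = entry.getKey().substring(CONSUMER_PREFIX.length());
        if (FORBIDDEN.contains(consumerKey)) {
          throw new IllegalArgumentException("Property " + consumerKey + " cannot be overridden");
        }
        // "kafka.consumer.fetch.max.wait.ms" -> "fetch.max.wait.ms", and so on for any other key.
        properties.setProperty(consumerKey, entry.getValue());
      }
    }
    return properties;
  }

  public static void main(String[] args) {
    Configuration configuration = new Configuration(false);
    configuration.set("kafka.bootstrap.servers", "localhost:9090");
    configuration.set("kafka.consumer.fetch.max.wait.ms", "40");
    // Prints "40", mirroring the first assertions in the test above.
    System.out.println(consumerProperties(configuration).getProperty("fetch.max.wait.ms"));
  }
}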
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Package info file. + */ + +package org.apache.hadoop.hive.kafka; diff --git llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapServiceDriver.java llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapServiceDriver.java index 0f1d5eea4c..ffdd340fde 100644 --- llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapServiceDriver.java +++ llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapServiceDriver.java @@ -18,44 +18,9 @@ package org.apache.hadoop.hive.llap.cli; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.net.URI; -import java.net.URISyntaxException; -import java.net.URL; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Collection; -import java.util.List; -import java.util.Properties; -import java.util.Set; -import java.util.concurrent.Callable; -import java.util.concurrent.CompletionService; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; - +import com.google.common.base.Preconditions; +import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.hive.llap.LlapUtil; -import org.apache.hadoop.hive.llap.configuration.LlapDaemonConfiguration; -import org.apache.hadoop.hive.llap.daemon.impl.LlapConstants; -import org.apache.hadoop.hive.llap.daemon.impl.StaticPermanentFunctionChecker; -import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos; -import org.apache.hadoop.hive.llap.tezplugins.LlapTezUtils; -import org.apache.hadoop.registry.client.binding.RegistryUtils; -import org.apache.tez.dag.api.TezConfiguration; -import org.codehaus.jettison.json.JSONException; -import org.codehaus.jettison.json.JSONObject; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -64,8 +29,14 @@ import org.apache.hadoop.hive.common.CompressionUtils; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; +import org.apache.hadoop.hive.llap.LlapUtil; import org.apache.hadoop.hive.llap.cli.LlapOptionsProcessor.LlapOptions; +import org.apache.hadoop.hive.llap.configuration.LlapDaemonConfiguration; +import org.apache.hadoop.hive.llap.daemon.impl.LlapConstants; +import org.apache.hadoop.hive.llap.daemon.impl.StaticPermanentFunctionChecker; +import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos; import org.apache.hadoop.hive.llap.io.api.impl.LlapInputFormat; +import org.apache.hadoop.hive.llap.tezplugins.LlapTezUtils; import 
org.apache.hadoop.hive.metastore.api.Function; import org.apache.hadoop.hive.metastore.api.ResourceUri; import org.apache.hadoop.hive.ql.exec.Utilities; @@ -77,22 +48,50 @@ import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.registry.client.binding.RegistryUtils; import org.apache.hadoop.yarn.conf.YarnConfiguration; +import org.apache.tez.dag.api.TezConfiguration; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; import org.eclipse.jetty.rewrite.handler.Rule; import org.eclipse.jetty.util.ssl.SslContextFactory; import org.joda.time.DateTime; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -import com.google.common.base.Preconditions; -import com.google.common.util.concurrent.ThreadFactoryBuilder; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; public class LlapServiceDriver { protected static final Logger LOG = LoggerFactory.getLogger(LlapServiceDriver.class.getName()); - private static final String[] DEFAULT_AUX_CLASSES = new String[] { - "org.apache.hive.hcatalog.data.JsonSerDe","org.apache.hadoop.hive.druid.DruidStorageHandler", - "org.apache.hive.storage.jdbc.JdbcStorageHandler", "org.apache.commons.dbcp.BasicDataSourceFactory", - "org.apache.commons.pool.impl.GenericObjectPool" - }; + private static final String[] + DEFAULT_AUX_CLASSES = + new String[] { "org.apache.hive.hcatalog.data.JsonSerDe", "org.apache.hadoop.hive.druid.DruidStorageHandler", + "org.apache.hive.storage.jdbc.JdbcStorageHandler", "org.apache.commons.dbcp.BasicDataSourceFactory", + "org.apache.commons.pool.impl.GenericObjectPool", "org.apache.hadoop.hive.kafka.KafkaStorageHandler" }; private static final String HBASE_SERDE_CLASS = "org.apache.hadoop.hive.hbase.HBaseSerDe"; private static final String[] NEEDED_CONFIGS = LlapDaemonConfiguration.DAEMON_CONFIGS; private static final String[] OPTIONAL_CONFIGS = LlapDaemonConfiguration.SSL_DAEMON_CONFIGS; diff --git packaging/pom.xml packaging/pom.xml index 5c859acfad..0f0037bd61 100644 --- packaging/pom.xml +++ packaging/pom.xml @@ -213,6 +213,11 @@ hive-druid-handler ${project.version} + + org.apache.hive + kafka-handler + ${project.version} + org.apache.hive hive-jdbc-handler diff --git pom.xml pom.xml index 7503cff532..9c15328ecc 100644 --- pom.xml +++ pom.xml @@ -63,6 +63,7 @@ packaging standalone-metastore upgrade-acid + kafka-handler @@ -80,6 +81,7 @@ + ${maven.test.classpath} file:// ${project.build.directory}/tmp @@ -1480,7 +1482,7 @@ org.apache.maven.plugins maven-javadoc-plugin - false + -Xdoclint:none @@ -1566,6 +1568,7 @@ ${basedir}/${hive.path.to.root}/testutils/hadoop.cmd + ;${env.HADOOP_HOME}/bin ${project.build.directory}/deplibs/* file:///${test.tmp.dir} diff --git 
ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java index da31f4d9a2..f39ba87a80 100644 --- ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java +++ ql/src/java/org/apache/hadoop/hive/ql/index/IndexPredicateAnalyzer.java @@ -17,16 +17,6 @@ */ package org.apache.hadoop.hive.ql.index; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.Stack; - import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; @@ -44,6 +34,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToChar; @@ -54,7 +45,16 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToVarchar; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; /** * IndexPredicateAnalyzer decomposes predicates, separating the parts @@ -181,7 +181,7 @@ public Object process(Node nd, Stack stack, //Check if ExprNodeColumnDesc is wrapped in expr. //If so, peel off. Otherwise return itself. 
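IndexPredicateAnalyzer, touched just above, is the generic machinery that decomposes comparison predicates so a storage handler can push them down. For the Kafka handler, the KafkaScanTrimmerTest expectations earlier in this patch spell out what an equality predicate on `__offset` must do to each partition's split: narrow it to the single record [offset, offset + 1) when the offset lies inside the split's [start, end) range, and collapse it to the empty [end, end) range ("seek to end") when it does not. The helper below is a hypothetical reading aid for that one rule; the names Range, trimToOffset and OffsetEqualsTrimSketch are invented here and are not the patch's KafkaScanTrimmer.

/** Sketch of the offset-EQUALS trimming rule encoded by buildScanFromOffsetPredicateEq; illustration only. */
public final class OffsetEqualsTrimSketch {

  /** A [startOffset, endOffset) slice of one Kafka partition. */
  static final class Range {
    final long startOffset;
    final long endOffset;

    Range(long startOffset, long endOffset) {
      this.startOffset = startOffset;
      this.endOffset = endOffset;
    }

    @Override public String toString() {
      return "[" + startOffset + ", " + endOffset + ")";
    }
  }

  /** Trim a partition's range to the single record at {@code offset}, if that record is still available. */
  static Range trimToOffset(Range split, long offset) {
    if (offset >= split.startOffset && offset < split.endOffset) {
      return new Range(offset, offset + 1);             // exactly one record to read
    }
    return new Range(split.endOffset, split.endOffset); // out of reach: empty range, i.e. seek to end
  }

  public static void main(String[] args) {
    Range full = new Range(0, 10);
    System.out.println(trimToOffset(full, 5));        // [5, 6)
    System.out.println(trimToOffset(full, 3000000L)); // [10, 10), nothing left to read
  }
}

The sample values line up with the q.out later in the patch, where the query with `__offset` = 5 reports __start_offset 5 and __end_offset 6.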
- private ExprNodeDesc getColumnExpr(ExprNodeDesc expr) { + private static ExprNodeDesc getColumnExpr(ExprNodeDesc expr) { if (expr instanceof ExprNodeColumnDesc) { return expr; } diff --git ql/src/test/queries/clientpositive/kafka_storage_handler.q ql/src/test/queries/clientpositive/kafka_storage_handler.q new file mode 100644 index 0000000000..8daa3e3bc0 --- /dev/null +++ ql/src/test/queries/clientpositive/kafka_storage_handler.q @@ -0,0 +1,236 @@ +SET hive.vectorized.execution.enabled=false; + +CREATE EXTERNAL TABLE kafka_table +(`__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint) +STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' +WITH SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd\'T\'HH:mm:ss\'Z\'") +TBLPROPERTIES +("kafka.topic" = "test-topic", +"kafka.bootstrap.servers"="localhost:9092", +"kafka.serde.class"="org.apache.hadoop.hive.serde2.JsonSerDe") +; + +DESCRIBE EXTENDED kafka_table; + +Select `__partition` ,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta FROM kafka_table; + +Select count(*) FROM kafka_table; + +Select `__partition`, `__offset`,`__start_offset`,`__end_offset`, `__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where `__timestamp` > 1533960760123; +Select `__partition`, `__offset` ,`__start_offset`,`__end_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where `__timestamp` > 533960760123; + +Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where (`__offset` > 7 and `__partition` = 0 and `__offset` <9 ) OR +`__offset` = 4 and `__partition` = 0 OR (`__offset` <= 1 and `__partition` = 0 and `__offset` > 0); + +Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` = 5; + +Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` < 5; + +Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` > 5; + +-- Timestamp filter + +Select `__partition`,`__start_offset`,`__end_offset`, `__offset`, `user` from kafka_table where +`__timestamp` > 1000 * to_unix_timestamp(CURRENT_TIMESTAMP - interval '1' HOURS) ; + +-- non existing partition +Select count(*) from kafka_table where `__partition` = 1; + +-- non existing offset +Select count(*) from kafka_table where `__offset` = 100; + +-- less than non existing offset and partition +Select count(*) from kafka_table where `__offset` <= 100 and `__partition` <= 100; + +Drop table kafka_table_offsets; +create table kafka_table_offsets(partition_id int, max_offset bigint, insert_time timestamp); + +insert overwrite table kafka_table_offsets select `__partition`, min(`__offset`) - 1, CURRENT_TIMESTAMP from kafka_table group by `__partition`, CURRENT_TIMESTAMP ; + +-- 
check initial state: partition 0 starts at max_offset -1 (one before the first offset) +select partition_id, max_offset from kafka_table_offsets; + +Drop table orc_kafka_table; +Create table orc_kafka_table (partition_id int, row_offset bigint, kafka_ts bigint, + `__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint +) stored as ORC; + + +From kafka_table ktable JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset and ktable.`__offset` < 3 ) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP; + +-- should ingest only first 3 rows +select count(*) from orc_kafka_table; + +-- check max offset is 2 +select partition_id, max_offset from kafka_table_offsets; + +-- 3 rows from offset 0 to 2 +select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table; + + +-- insert the rest using inner join + +From kafka_table ktable JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`,
CURRENT_TIMESTAMP; + +select count(*) from orc_kafka_table; + +select partition_id, max_offset from kafka_table_offsets; + +select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table; + +-- using basic implementation of flat json probably to be removed +CREATE EXTERNAL TABLE kafka_table_2 +(`__time` timestamp with local time zone , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint) +STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' +TBLPROPERTIES +("kafka.topic" = "test-topic", +"kafka.bootstrap.servers"="localhost:9092"); + +Select `__partition`, `__offset`, `__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +FROM kafka_table_2; + +Select count(*) FROM kafka_table_2; + + +CREATE EXTERNAL TABLE wiki_kafka_avro_table +STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' +TBLPROPERTIES +("kafka.topic" = "wiki_kafka_avro_table", +"kafka.bootstrap.servers"="localhost:9092", +"kafka.serde.class"="org.apache.hadoop.hive.serde2.avro.AvroSerDe", +'avro.schema.literal'='{ + "type" : "record", + "name" : "Wikipedia", + "namespace" : "org.apache.hive.kafka", + "version": "1", + "fields" : [ { + "name" : "isrobot", + "type" : "boolean" + }, { + "name" : "channel", + "type" : "string" + }, { + "name" : "timestamp", + "type" : "string" + }, { + "name" : "flags", + "type" : "string" + }, { + "name" : "isunpatrolled", + "type" : "boolean" + }, { + "name" : "page", + "type" : "string" + }, { + "name" : "diffurl", + "type" : "string" + }, { + "name" : "added", + "type" : "long" + }, { + "name" : "comment", + "type" : "string" + }, { + "name" : "commentlength", + "type" : "long" + }, { + "name" : "isnew", + "type" : "boolean" + }, { + "name" : "isminor", + "type" : "boolean" + }, { + "name" : "delta", + "type" : "long" + }, { + "name" : "isanonymous", + "type" : "boolean" + }, { + "name" : "user", + "type" : "string" + }, { + "name" : "deltabucket", + "type" : "double" + }, { + "name" : "deleted", + "type" : "long" + }, { + "name" : "namespace", + "type" : "string" + } ] +}' +); + +describe extended wiki_kafka_avro_table; + +select cast ((`__timestamp`/1000) as timestamp) as kafka_record_ts, `__partition`, `__offset`, `timestamp`, `user`, `page`, `deleted`, `deltabucket`, `isanonymous`, `commentlength` from wiki_kafka_avro_table; + +select count(*) from wiki_kafka_avro_table; + +select count(distinct `user`) from wiki_kafka_avro_table; + +select sum(deltabucket), min(commentlength) from wiki_kafka_avro_table; + +select cast ((`__timestamp`/1000) as timestamp) as kafka_record_ts, `__timestamp` as kafka_record_ts_long, +`__partition`, `__start_offset`,`__end_offset`,`__offset`, `timestamp`, `user`, `page`, `deleted`, `deltabucket`, +`isanonymous`, `commentlength` from wiki_kafka_avro_table where `__timestamp` > 1534750625090; + diff --git ql/src/test/results/clientpositive/druid/kafka_storage_handler.q.out ql/src/test/results/clientpositive/druid/kafka_storage_handler.q.out new file mode 100644 index 0000000000..3dec33d790 --- /dev/null +++ ql/src/test/results/clientpositive/druid/kafka_storage_handler.q.out @@ -0,0 +1,845 @@ +PREHOOK: query: CREATE EXTERNAL TABLE kafka_table 
+(`__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint) +STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' +WITH SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd\'T\'HH:mm:ss\'Z\'") +TBLPROPERTIES +("kafka.topic" = "test-topic", +"kafka.bootstrap.servers"="localhost:9092", +"kafka.serde.class"="org.apache.hadoop.hive.serde2.JsonSerDe") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@kafka_table +POSTHOOK: query: CREATE EXTERNAL TABLE kafka_table +(`__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint) +STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' +WITH SERDEPROPERTIES ("timestamp.formats"="yyyy-MM-dd\'T\'HH:mm:ss\'Z\'") +TBLPROPERTIES +("kafka.topic" = "test-topic", +"kafka.bootstrap.servers"="localhost:9092", +"kafka.serde.class"="org.apache.hadoop.hive.serde2.JsonSerDe") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@kafka_table +PREHOOK: query: DESCRIBE EXTENDED kafka_table +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@kafka_table +POSTHOOK: query: DESCRIBE EXTENDED kafka_table +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@kafka_table +__time timestamp from deserializer +page string from deserializer +user string from deserializer +language string from deserializer +country string from deserializer +continent string from deserializer +namespace string from deserializer +newpage boolean from deserializer +unpatrolled boolean from deserializer +anonymous boolean from deserializer +robot boolean from deserializer +added int from deserializer +deleted int from deserializer +delta bigint from deserializer +__partition int from deserializer +__offset bigint from deserializer +__timestamp bigint from deserializer +__start_offset bigint from deserializer +__end_offset bigint from deserializer + +#### A masked pattern was here #### +PREHOOK: query: Select `__partition` ,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta FROM kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition` ,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta FROM kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 10 0 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 0 10 1 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 0 10 2 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 0 10 3 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 0 10 4 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +0 0 10 5 NULL Gypsy Danger nuclear en United States North America article true true 
false false 57 200 -143 +0 0 10 6 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 0 10 7 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 0 10 8 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 0 10 9 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +PREHOOK: query: Select count(*) FROM kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select count(*) FROM kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +10 +PREHOOK: query: Select `__partition`, `__offset`,`__start_offset`,`__end_offset`, `__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where `__timestamp` > 1533960760123 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition`, `__offset`,`__start_offset`,`__end_offset`, `__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where `__timestamp` > 1533960760123 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 0 10 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 1 0 10 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 2 0 10 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 3 0 10 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 4 0 10 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +0 5 0 10 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 6 0 10 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 7 0 10 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 8 0 10 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 9 0 10 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +PREHOOK: query: Select `__partition`, `__offset` ,`__start_offset`,`__end_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where `__timestamp` > 533960760123 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition`, `__offset` ,`__start_offset`,`__end_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where `__timestamp` > 533960760123 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 0 10 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 1 0 10 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 2 0 10 NULL Cherno Alpha masterYi ru Russia Asia article true false 
false true 123 12 111 +0 3 0 10 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 4 0 10 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +0 5 0 10 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 6 0 10 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 7 0 10 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 8 0 10 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 9 0 10 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +PREHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where (`__offset` > 7 and `__partition` = 0 and `__offset` <9 ) OR +`__offset` = 4 and `__partition` = 0 OR (`__offset` <= 1 and `__partition` = 0 and `__offset` > 0) +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +from kafka_table where (`__offset` > 7 and `__partition` = 0 and `__offset` <9 ) OR +`__offset` = 4 and `__partition` = 0 OR (`__offset` <= 1 and `__partition` = 0 and `__offset` > 0) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 1 9 1 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 1 9 4 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +0 1 9 8 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +PREHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` = 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` = 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 5 6 5 NULL Gypsy Danger nuclear +PREHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` < 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` < 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 5 0 NULL Gypsy Danger nuclear +0 0 5 1 NULL Striker Eureka speed +0 0 5 2 NULL Cherno Alpha masterYi +0 0 5 3 NULL Crimson Typhoon triplets +0 0 5 4 NULL Coyote Tango stringer +PREHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` > 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select 
`__partition`,`__start_offset`,`__end_offset`, `__offset`,`__time`, `page`, `user` from kafka_table where `__offset` > 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 6 10 6 NULL Striker Eureka speed +0 6 10 7 NULL Cherno Alpha masterYi +0 6 10 8 NULL Crimson Typhoon triplets +0 6 10 9 NULL Coyote Tango stringer +PREHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`, `user` from kafka_table where +`__timestamp` > 1000 * to_unix_timestamp(CURRENT_TIMESTAMP - interval '1' HOURS) +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select `__partition`,`__start_offset`,`__end_offset`, `__offset`, `user` from kafka_table where +`__timestamp` > 1000 * to_unix_timestamp(CURRENT_TIMESTAMP - interval '1' HOURS) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 10 0 nuclear +0 0 10 1 speed +0 0 10 2 masterYi +0 0 10 3 triplets +0 0 10 4 stringer +0 0 10 5 nuclear +0 0 10 6 speed +0 0 10 7 masterYi +0 0 10 8 triplets +0 0 10 9 stringer +PREHOOK: query: Select count(*) from kafka_table where `__partition` = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select count(*) from kafka_table where `__partition` = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 +PREHOOK: query: Select count(*) from kafka_table where `__offset` = 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select count(*) from kafka_table where `__offset` = 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 +PREHOOK: query: Select count(*) from kafka_table where `__offset` <= 100 and `__partition` <= 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: Select count(*) from kafka_table where `__offset` <= 100 and `__partition` <= 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +10 +PREHOOK: query: Drop table kafka_table_offsets +PREHOOK: type: DROPTABLE +POSTHOOK: query: Drop table kafka_table_offsets +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table kafka_table_offsets(partition_id int, max_offset bigint, insert_time timestamp) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@kafka_table_offsets +POSTHOOK: query: create table kafka_table_offsets(partition_id int, max_offset bigint, insert_time timestamp) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@kafka_table_offsets +PREHOOK: query: insert overwrite table kafka_table_offsets select `__partition`, min(`__offset`) - 1, CURRENT_TIMESTAMP from kafka_table group by `__partition`, CURRENT_TIMESTAMP +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Output: default@kafka_table_offsets +POSTHOOK: query: insert overwrite table kafka_table_offsets select `__partition`, min(`__offset`) - 1, CURRENT_TIMESTAMP from kafka_table group by `__partition`, CURRENT_TIMESTAMP +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Output: default@kafka_table_offsets +POSTHOOK: Lineage: kafka_table_offsets.insert_time SIMPLE [] +POSTHOOK: Lineage: 
kafka_table_offsets.max_offset EXPRESSION [(kafka_table)kafka_table.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: kafka_table_offsets.partition_id SIMPLE [(kafka_table)kafka_table.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +PREHOOK: query: select partition_id, max_offset from kafka_table_offsets +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select partition_id, max_offset from kafka_table_offsets +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 -1 +PREHOOK: query: Drop table orc_kafka_table +PREHOOK: type: DROPTABLE +POSTHOOK: query: Drop table orc_kafka_table +POSTHOOK: type: DROPTABLE +PREHOOK: query: Create table orc_kafka_table (partition_id int, row_offset bigint, kafka_ts bigint, + `__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint +) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_kafka_table +POSTHOOK: query: Create table orc_kafka_table (partition_id int, row_offset bigint, kafka_ts bigint, + `__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint +) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_kafka_table +PREHOOK: query: From kafka_table ktable JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset and ktable.`__offset` < 3 ) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: default@kafka_table_offsets +PREHOOK: Output: default@orc_kafka_table +POSTHOOK: query: From kafka_table ktable JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset and ktable.`__offset` < 3 ) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: default@kafka_table_offsets +POSTHOOK: Output: default@orc_kafka_table +POSTHOOK: Lineage: kafka_table_offsets.insert_time EXPRESSION [] +POSTHOOK: Lineage: kafka_table_offsets.max_offset EXPRESSION [(kafka_table)ktable.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] 
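A few queries back, the test also filters on the record timestamp, for example `__timestamp` > 1000 * to_unix_timestamp(CURRENT_TIMESTAMP - interval '1' HOURS), and that one-hour bound still matches all ten records. Turning such a bound into per-partition start offsets is the interesting part, and it is not visible in this excerpt; the plain consumer API does offer KafkaConsumer.offsetsForTimes for exactly that translation, so the sketch below illustrates the idea under that assumption rather than quoting the patch.

import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/** Illustration only: map a "__timestamp > t" style bound to per-partition start offsets. */
public final class TimestampToOffsetSketch {

  static Map<TopicPartition, Long> startOffsetsFor(KafkaConsumer<byte[], byte[]> consumer,
      Map<TopicPartition, Long> lowerBoundMillis) {
    Map<TopicPartition, OffsetAndTimestamp> found = consumer.offsetsForTimes(lowerBoundMillis);
    Map<TopicPartition, Long> startOffsets = new HashMap<>();
    found.forEach((partition, offsetAndTimestamp) -> {
      // null means no record at or after the requested timestamp, so that partition can be skipped.
      if (offsetAndTimestamp != null) {
        startOffsets.put(partition, offsetAndTimestamp.offset());
      }
    });
    return startOffsets;
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:9092"); // the broker the q-tests point at; needs to be running
    try (KafkaConsumer<byte[], byte[]> consumer =
        new KafkaConsumer<>(props, new ByteArrayDeserializer(), new ByteArrayDeserializer())) {
      Map<TopicPartition, Long> bounds = new HashMap<>();
      bounds.put(new TopicPartition("test-topic", 0), System.currentTimeMillis() - 3_600_000L);
      System.out.println(startOffsetsFor(consumer, bounds));
    }
  }
}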
+POSTHOOK: Lineage: kafka_table_offsets.partition_id SIMPLE [(kafka_table)ktable.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.__time SIMPLE [(kafka_table)ktable.FieldSchema(name:__time, type:timestamp, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.added SIMPLE [(kafka_table)ktable.FieldSchema(name:added, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.anonymous SIMPLE [(kafka_table)ktable.FieldSchema(name:anonymous, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.continent SIMPLE [(kafka_table)ktable.FieldSchema(name:continent, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.country SIMPLE [(kafka_table)ktable.FieldSchema(name:country, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.deleted SIMPLE [(kafka_table)ktable.FieldSchema(name:deleted, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.delta SIMPLE [(kafka_table)ktable.FieldSchema(name:delta, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.kafka_ts SIMPLE [(kafka_table)ktable.FieldSchema(name:__timestamp, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.language SIMPLE [(kafka_table)ktable.FieldSchema(name:language, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.namespace SIMPLE [(kafka_table)ktable.FieldSchema(name:namespace, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.newpage SIMPLE [(kafka_table)ktable.FieldSchema(name:newpage, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.page SIMPLE [(kafka_table)ktable.FieldSchema(name:page, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.partition_id SIMPLE [(kafka_table)ktable.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.robot SIMPLE [(kafka_table)ktable.FieldSchema(name:robot, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.row_offset SIMPLE [(kafka_table)ktable.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.unpatrolled SIMPLE [(kafka_table)ktable.FieldSchema(name:unpatrolled, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.user SIMPLE [(kafka_table)ktable.FieldSchema(name:user, type:string, comment:from deserializer), ] +PREHOOK: query: select count(*) from orc_kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from orc_kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +3 +PREHOOK: query: select partition_id, max_offset from kafka_table_offsets +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select partition_id, max_offset from kafka_table_offsets +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 2 +PREHOOK: query: select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_kafka_table 
+PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 1 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 2 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +PREHOOK: query: From kafka_table ktable JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: default@kafka_table_offsets +PREHOOK: Output: default@orc_kafka_table +POSTHOOK: query: From kafka_table ktable JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: default@kafka_table_offsets +POSTHOOK: Output: default@orc_kafka_table +POSTHOOK: Lineage: kafka_table_offsets.insert_time EXPRESSION [] +POSTHOOK: Lineage: kafka_table_offsets.max_offset EXPRESSION [(kafka_table)ktable.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: kafka_table_offsets.partition_id SIMPLE [(kafka_table)ktable.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.__time SIMPLE [(kafka_table)ktable.FieldSchema(name:__time, type:timestamp, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.added SIMPLE [(kafka_table)ktable.FieldSchema(name:added, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.anonymous SIMPLE [(kafka_table)ktable.FieldSchema(name:anonymous, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.continent SIMPLE [(kafka_table)ktable.FieldSchema(name:continent, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.country SIMPLE [(kafka_table)ktable.FieldSchema(name:country, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.deleted SIMPLE [(kafka_table)ktable.FieldSchema(name:deleted, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.delta SIMPLE [(kafka_table)ktable.FieldSchema(name:delta, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: 
orc_kafka_table.kafka_ts SIMPLE [(kafka_table)ktable.FieldSchema(name:__timestamp, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.language SIMPLE [(kafka_table)ktable.FieldSchema(name:language, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.namespace SIMPLE [(kafka_table)ktable.FieldSchema(name:namespace, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.newpage SIMPLE [(kafka_table)ktable.FieldSchema(name:newpage, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.page SIMPLE [(kafka_table)ktable.FieldSchema(name:page, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.partition_id SIMPLE [(kafka_table)ktable.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.robot SIMPLE [(kafka_table)ktable.FieldSchema(name:robot, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.row_offset SIMPLE [(kafka_table)ktable.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.unpatrolled SIMPLE [(kafka_table)ktable.FieldSchema(name:unpatrolled, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.user SIMPLE [(kafka_table)ktable.FieldSchema(name:user, type:string, comment:from deserializer), ] +PREHOOK: query: select partition_id, max_offset from kafka_table_offsets +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select partition_id, max_offset from kafka_table_offsets +POSTHOOK: type: QUERY +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 9 +PREHOOK: query: select count(*) from orc_kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from orc_kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +10 +PREHOOK: query: select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +0 0 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 1 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 2 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 3 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 4 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +0 5 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143 +0 6 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330 +0 7 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111 +0 8 NULL Crimson 
Typhoon triplets zh China Asia wikipedia false true false true 905 5 900 +0 9 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9 +PREHOOK: query: Drop table kafka_table_offsets +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: default@kafka_table_offsets +POSTHOOK: query: Drop table kafka_table_offsets +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: default@kafka_table_offsets +PREHOOK: query: create table kafka_table_offsets(partition_id int, max_offset bigint, insert_time timestamp) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@kafka_table_offsets +POSTHOOK: query: create table kafka_table_offsets(partition_id int, max_offset bigint, insert_time timestamp) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@kafka_table_offsets +PREHOOK: query: Drop table orc_kafka_table +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@orc_kafka_table +PREHOOK: Output: default@orc_kafka_table +POSTHOOK: query: Drop table orc_kafka_table +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@orc_kafka_table +POSTHOOK: Output: default@orc_kafka_table +PREHOOK: query: Create table orc_kafka_table (partition_id int, row_offset bigint, kafka_ts bigint, + `__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint +) stored as ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orc_kafka_table +POSTHOOK: query: Create table orc_kafka_table (partition_id int, row_offset bigint, kafka_ts bigint, + `__time` timestamp , `page` string, `user` string, `language` string, +`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean, +`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint +) stored as ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orc_kafka_table +PREHOOK: query: From kafka_table ktable LEFT OUTER JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset ) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: default@kafka_table_offsets +PREHOOK: Output: default@orc_kafka_table +POSTHOOK: query: From kafka_table ktable LEFT OUTER JOIN kafka_table_offsets offset_table +on (ktable.`__partition` = offset_table.partition_id and ktable.`__offset` > offset_table.max_offset ) +insert into table orc_kafka_table select `__partition`, `__offset`, `__timestamp`, +`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` , +`unpatrolled` , `anonymous` , `robot` , added , deleted , delta +Insert overwrite table kafka_table_offsets select +`__partition`, max(`__offset`), CURRENT_TIMESTAMP group by `__partition`, CURRENT_TIMESTAMP +POSTHOOK: type: QUERY 
+POSTHOOK: Input: default@kafka_table +POSTHOOK: Input: default@kafka_table_offsets +POSTHOOK: Output: default@kafka_table_offsets +POSTHOOK: Output: default@orc_kafka_table +POSTHOOK: Lineage: kafka_table_offsets.insert_time EXPRESSION [] +POSTHOOK: Lineage: kafka_table_offsets.max_offset EXPRESSION [(kafka_table)ktable.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: kafka_table_offsets.partition_id SIMPLE [(kafka_table)ktable.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.__time SIMPLE [(kafka_table)ktable.FieldSchema(name:__time, type:timestamp, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.added SIMPLE [(kafka_table)ktable.FieldSchema(name:added, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.anonymous SIMPLE [(kafka_table)ktable.FieldSchema(name:anonymous, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.continent SIMPLE [(kafka_table)ktable.FieldSchema(name:continent, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.country SIMPLE [(kafka_table)ktable.FieldSchema(name:country, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.deleted SIMPLE [(kafka_table)ktable.FieldSchema(name:deleted, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.delta SIMPLE [(kafka_table)ktable.FieldSchema(name:delta, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.kafka_ts SIMPLE [(kafka_table)ktable.FieldSchema(name:__timestamp, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.language SIMPLE [(kafka_table)ktable.FieldSchema(name:language, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.namespace SIMPLE [(kafka_table)ktable.FieldSchema(name:namespace, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.newpage SIMPLE [(kafka_table)ktable.FieldSchema(name:newpage, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.page SIMPLE [(kafka_table)ktable.FieldSchema(name:page, type:string, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.partition_id SIMPLE [(kafka_table)ktable.FieldSchema(name:__partition, type:int, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.robot SIMPLE [(kafka_table)ktable.FieldSchema(name:robot, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.row_offset SIMPLE [(kafka_table)ktable.FieldSchema(name:__offset, type:bigint, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.unpatrolled SIMPLE [(kafka_table)ktable.FieldSchema(name:unpatrolled, type:boolean, comment:from deserializer), ] +POSTHOOK: Lineage: orc_kafka_table.user SIMPLE [(kafka_table)ktable.FieldSchema(name:user, type:string, comment:from deserializer), ] +PREHOOK: query: select count(*) from orc_kafka_table +PREHOOK: type: QUERY +PREHOOK: Input: default@orc_kafka_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from orc_kafka_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orc_kafka_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +10 +PREHOOK: query: select partition_id, max_offset from kafka_table_offsets +PREHOOK: type: QUERY +PREHOOK: Input: default@kafka_table_offsets +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select partition_id, max_offset from kafka_table_offsets +POSTHOOK: 
+PREHOOK: query: select count(*) from orc_kafka_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_kafka_table
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from orc_kafka_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_kafka_table
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+10
+PREHOOK: query: select partition_id, max_offset from kafka_table_offsets
+PREHOOK: type: QUERY
+PREHOOK: Input: default@kafka_table_offsets
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select partition_id, max_offset from kafka_table_offsets
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@kafka_table_offsets
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+0 9
+PREHOOK: query: select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` ,
+`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_kafka_table
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select `partition_id`, `row_offset`,`__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` ,
+`unpatrolled` , `anonymous` , `robot` , added , deleted , delta from orc_kafka_table
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_kafka_table
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+0 0 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143
+0 1 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330
+0 2 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111
+0 3 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900
+0 4 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9
+0 5 NULL Gypsy Danger nuclear en United States North America article true true false false 57 200 -143
+0 6 NULL Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330
+0 7 NULL Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111
+0 8 NULL Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900
+0 9 NULL Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9
+PREHOOK: query: CREATE EXTERNAL TABLE kafka_table_2
+(`__time` timestamp with local time zone , `page` string, `user` string, `language` string,
+`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean,
+`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint)
+STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler'
+TBLPROPERTIES
+("kafka.topic" = "test-topic",
+"kafka.bootstrap.servers"="localhost:9092")
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@kafka_table_2
+POSTHOOK: query: CREATE EXTERNAL TABLE kafka_table_2
+(`__time` timestamp with local time zone , `page` string, `user` string, `language` string,
+`country` string,`continent` string, `namespace` string, `newPage` boolean, `unpatrolled` boolean,
+`anonymous` boolean, `robot` boolean, added int, deleted int, delta bigint)
+STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler'
+TBLPROPERTIES
+("kafka.topic" = "test-topic",
+"kafka.bootstrap.servers"="localhost:9092")
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@kafka_table_2
+PREHOOK: query: Select `__partition`, `__offset`, `__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` ,
+`unpatrolled` , `anonymous` , `robot` , added , deleted , delta
+FROM kafka_table_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@kafka_table_2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: Select `__partition`, `__offset`, `__time`, `page`, `user`, `language`, `country`,`continent`, `namespace`, `newPage` ,
+`unpatrolled` , `anonymous` , `robot` , added , deleted , delta
+FROM kafka_table_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@kafka_table_2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+0 0 2013-08-30 18:02:33.0 US/Pacific Gypsy Danger nuclear en United States North America article true true false false 57 200 -143
+0 1 2013-08-30 20:32:45.0 US/Pacific Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330
+0 2 2013-08-31 00:11:21.0 US/Pacific Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111
+0 3 2013-08-31 04:58:39.0 US/Pacific Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900
+0 4 2013-08-31 05:41:27.0 US/Pacific Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9
+0 5 2013-08-31 18:02:33.0 US/Pacific Gypsy Danger nuclear en United States North America article true true false false 57 200 -143
+0 6 2013-08-31 20:32:45.0 US/Pacific Striker Eureka speed en Australia Australia wikipedia true false false true 459 129 330
+0 7 2013-09-01 00:11:21.0 US/Pacific Cherno Alpha masterYi ru Russia Asia article true false false true 123 12 111
+0 8 2013-09-01 04:58:39.0 US/Pacific Crimson Typhoon triplets zh China Asia wikipedia false true false true 905 5 900
+0 9 2013-09-01 05:41:27.0 US/Pacific Coyote Tango stringer ja Japan Asia wikipedia false true false true 1 10 -9
+PREHOOK: query: Select count(*) FROM kafka_table_2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@kafka_table_2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: Select count(*) FROM kafka_table_2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@kafka_table_2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+10
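Like kafka_table, kafka_table_2 exposes the Kafka metadata columns (`__partition`, `__offset`, `__time`, `__timestamp`) alongside the declared schema, so offset bookkeeping can also be done directly against the external table. A minimal sketch of that kind of query, assuming the table defined above (editor's illustration, not part of the golden file):

-- Sketch: per-partition offset range currently visible to a full scan.
SELECT `__partition`, min(`__offset`) AS first_offset, max(`__offset`) AS last_offset
FROM kafka_table_2
GROUP BY `__partition`;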
"name" : "flags", + "type" : "string" + }, { + "name" : "isunpatrolled", + "type" : "boolean" + }, { + "name" : "page", + "type" : "string" + }, { + "name" : "diffurl", + "type" : "string" + }, { + "name" : "added", + "type" : "long" + }, { + "name" : "comment", + "type" : "string" + }, { + "name" : "commentlength", + "type" : "long" + }, { + "name" : "isnew", + "type" : "boolean" + }, { + "name" : "isminor", + "type" : "boolean" + }, { + "name" : "delta", + "type" : "long" + }, { + "name" : "isanonymous", + "type" : "boolean" + }, { + "name" : "user", + "type" : "string" + }, { + "name" : "deltabucket", + "type" : "double" + }, { + "name" : "deleted", + "type" : "long" + }, { + "name" : "namespace", + "type" : "string" + } ] +}' +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@wiki_kafka_avro_table +PREHOOK: query: describe extended wiki_kafka_avro_table +PREHOOK: type: DESCTABLE +PREHOOK: Input: default@wiki_kafka_avro_table +POSTHOOK: query: describe extended wiki_kafka_avro_table +POSTHOOK: type: DESCTABLE +POSTHOOK: Input: default@wiki_kafka_avro_table +isrobot boolean from deserializer +channel string from deserializer +timestamp string from deserializer +flags string from deserializer +isunpatrolled boolean from deserializer +page string from deserializer +diffurl string from deserializer +added bigint from deserializer +comment string from deserializer +commentlength bigint from deserializer +isnew boolean from deserializer +isminor boolean from deserializer +delta bigint from deserializer +isanonymous boolean from deserializer +user string from deserializer +deltabucket double from deserializer +deleted bigint from deserializer +namespace string from deserializer +__partition int from deserializer +__offset bigint from deserializer +__timestamp bigint from deserializer +__start_offset bigint from deserializer +__end_offset bigint from deserializer + +#### A masked pattern was here #### +PREHOOK: query: select cast ((`__timestamp`/1000) as timestamp) as kafka_record_ts, `__partition`, `__offset`, `timestamp`, `user`, `page`, `deleted`, `deltabucket`, `isanonymous`, `commentlength` from wiki_kafka_avro_table +PREHOOK: type: QUERY +PREHOOK: Input: default@wiki_kafka_avro_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select cast ((`__timestamp`/1000) as timestamp) as kafka_record_ts, `__partition`, `__offset`, `timestamp`, `user`, `page`, `deleted`, `deltabucket`, `isanonymous`, `commentlength` from wiki_kafka_avro_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@wiki_kafka_avro_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +2018-08-20 03:37:05.09 0 0 08/19/2018 20:37:05 test-user-0 page is 0 0 0.0 false 0 +2018-08-20 04:37:05.09 0 1 08/19/2018 21:37:05 test-user-1 page is 100 -1 100.4 true 1 +2018-08-20 05:37:05.09 0 2 08/19/2018 22:37:05 test-user-2 page is 200 -2 200.8 true 2 +2018-08-20 06:37:05.09 0 3 08/19/2018 23:37:05 test-user-3 page is 300 -3 301.20000000000005 false 3 +2018-08-20 07:37:05.09 0 4 08/20/2018 00:37:05 test-user-4 page is 400 -4 401.6 true 4 +2018-08-20 08:37:05.09 0 5 08/20/2018 01:37:05 test-user-5 page is 500 -5 502.0 true 5 +2018-08-20 09:37:05.09 0 6 08/20/2018 02:37:05 test-user-6 page is 600 -6 602.4000000000001 false 6 +2018-08-20 10:37:05.09 0 7 08/20/2018 03:37:05 test-user-7 page is 700 -7 702.8000000000001 true 7 +2018-08-20 11:37:05.09 0 8 08/20/2018 04:37:05 test-user-8 page is 800 -8 803.2 true 8 +2018-08-20 12:37:05.09 0 9 08/20/2018 05:37:05 test-user-9 page is 900 -9 
903.6 false 9 +2018-08-20 13:37:05.09 0 10 08/20/2018 06:37:05 test-user-10 page is 1000 -10 1004.0 true 10 +PREHOOK: query: select count(*) from wiki_kafka_avro_table +PREHOOK: type: QUERY +PREHOOK: Input: default@wiki_kafka_avro_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(*) from wiki_kafka_avro_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@wiki_kafka_avro_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +11 +PREHOOK: query: select count(distinct `user`) from wiki_kafka_avro_table +PREHOOK: type: QUERY +PREHOOK: Input: default@wiki_kafka_avro_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select count(distinct `user`) from wiki_kafka_avro_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@wiki_kafka_avro_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +11 +PREHOOK: query: select sum(deltabucket), min(commentlength) from wiki_kafka_avro_table +PREHOOK: type: QUERY +PREHOOK: Input: default@wiki_kafka_avro_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select sum(deltabucket), min(commentlength) from wiki_kafka_avro_table +POSTHOOK: type: QUERY +POSTHOOK: Input: default@wiki_kafka_avro_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +5522.000000000001 0 +PREHOOK: query: select cast ((`__timestamp`/1000) as timestamp) as kafka_record_ts, `__timestamp` as kafka_record_ts_long, +`__partition`, `__start_offset`,`__end_offset`,`__offset`, `timestamp`, `user`, `page`, `deleted`, `deltabucket`, +`isanonymous`, `commentlength` from wiki_kafka_avro_table where `__timestamp` > 1534750625090 +PREHOOK: type: QUERY +PREHOOK: Input: default@wiki_kafka_avro_table +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select cast ((`__timestamp`/1000) as timestamp) as kafka_record_ts, `__timestamp` as kafka_record_ts_long, +`__partition`, `__start_offset`,`__end_offset`,`__offset`, `timestamp`, `user`, `page`, `deleted`, `deltabucket`, +`isanonymous`, `commentlength` from wiki_kafka_avro_table where `__timestamp` > 1534750625090 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@wiki_kafka_avro_table +POSTHOOK: Output: hdfs://### HDFS PATH ### +2018-08-20 08:37:05.09 1534754225090 0 5 11 5 08/20/2018 01:37:05 test-user-5 page is 500 -5 502.0 true 5 +2018-08-20 09:37:05.09 1534757825090 0 5 11 6 08/20/2018 02:37:05 test-user-6 page is 600 -6 602.4000000000001 false 6 +2018-08-20 10:37:05.09 1534761425090 0 5 11 7 08/20/2018 03:37:05 test-user-7 page is 700 -7 702.8000000000001 true 7 +2018-08-20 11:37:05.09 1534765025090 0 5 11 8 08/20/2018 04:37:05 test-user-8 page is 800 -8 803.2 true 8 +2018-08-20 12:37:05.09 1534768625090 0 5 11 9 08/20/2018 05:37:05 test-user-9 page is 900 -9 903.6 false 9 +2018-08-20 13:37:05.09 1534772225090 0 5 11 10 08/20/2018 06:37:05 test-user-10 page is 1000 -10 1004.0 true 10 diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java index e1e4d8bc5e..cd37bcc7ef 100644 --- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java +++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java @@ -386,7 +386,7 @@ private Object toLazyObject(Object field, ObjectInspector fieldOI) { * Convert the given object to a lazy object using the given {@link ObjectInspector} * * @param obj Object to be converted to a {@link LazyObject} - * @param oi ObjectInspector used for the conversion + * @param objectInspector ObjectInspector used for the 
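Note the `__start_offset` = 5 and `__end_offset` = 11 values in the filtered result above: the predicate on `__timestamp` narrowed the Kafka scan to offsets 5 through 10 instead of reading the topic from offset 0. Outside a golden-file test the cutoff would normally be computed relative to the current time rather than hard-coded; a sketch under that assumption (editor's illustration, Hive interval syntax assumed):

-- Sketch: only records whose Kafka timestamp (epoch millis) falls in the last 10 minutes.
SELECT `user`, `page`, `deltabucket`
FROM wiki_kafka_avro_table
WHERE `__timestamp` > 1000 * to_unix_timestamp(CURRENT_TIMESTAMP - INTERVAL '10' MINUTES);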
diff --git serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java
index e1e4d8bc5e..cd37bcc7ef 100644
--- serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java
+++ serde/src/java/org/apache/hadoop/hive/serde2/avro/AvroLazyObjectInspector.java
@@ -386,7 +386,7 @@ private Object toLazyObject(Object field, ObjectInspector fieldOI) {
    * Convert the given object to a lazy object using the given {@link ObjectInspector}
    *
    * @param obj Object to be converted to a {@link LazyObject}
-   * @param oi ObjectInspector used for the conversion
+   * @param objectInspector ObjectInspector used for the conversion
    * @return the created {@link LazyObject lazy object}
    * */
   private Object toLazyListObject(Object obj, ObjectInspector objectInspector) {
@@ -414,7 +414,7 @@ private Object toLazyListObject(Object obj, ObjectInspector objectInspector) {
    * Convert the given object to a lazy object using the given {@link ObjectInspector}
    *
    * @param obj Object to be converted to a {@link LazyObject}
-   * @param oi ObjectInspector used for the conversion
+   * @param objectInspector ObjectInspector used for the conversion
    * @return the created {@link LazyObject lazy object}
    * */
   @SuppressWarnings({ "rawtypes", "unchecked" })
@@ -451,7 +451,7 @@ private Object toLazyMapObject(Object obj, ObjectInspector objectInspector) {
    * Convert the given object to a lazy object using the given {@link ObjectInspector}
    *
    * @param obj Object to be converted to a {@link LazyObject}
-   * @param oi ObjectInspector used for the conversion
+   * @param objectInspector ObjectInspector used for the conversion
    * @return the created {@link LazyObject lazy object}
    * */
   private Object toLazyUnionObject(Object obj, ObjectInspector objectInspector) {