Since the Parquet library has been updated, we no longer need to filter null records out of the results it returns, because the library now skips those records itself:
From parquet-hadoop/src/main/java/parquet/hadoop/InternalParquetRecordReader.java:
public boolean nextKeyValue() throws IOException, InterruptedException {
boolean recordFound = false;
while (!recordFound) {
// no more records left
if (current >= total)
try {
checkRead();
currentValue = recordReader.read();
current ++;
if (recordReader.shouldSkipCurrentRecord())
if (currentValue == null)
{ // only happens with FilteredRecordReader at end of block current = totalCountLoadedSoFar; if (DEBUG) LOG.debug("filtered record reader reached end of block"); continue; }
recordFound = true;
if (DEBUG) LOG.debug("read value: " + currentValue);
} catch (RuntimeException e)
}
return true;
}
- links to