Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.
The class AbstractParquetScanBatchCreator, method createReaderAndImplicitColumns.
/**
* Create a reader and add it to the list of readers.
*
* @param context The fragment context
* @param rowGroupScan RowGroup Scan
* @param oContext Operator context
* @param columnExplorer The column helper class object
* @param readers the list of readers to which the new reader is added
* @param implicitColumns the list of implicit columns
* @param mapWithMaxColumns the implicit-column map with the most entries seen so far; replaced when this row group's map is larger
* @param rowGroup create a reader for this specific row group
* @param fs file system
* @param footer this file's footer
* @param readSchemaOnly if true, the number of rows to read is set to zero (schema-only read)
* @return the (possibly modified) input mapWithMaxColumns
*/
private Map<String, String> createReaderAndImplicitColumns(ExecutorFragmentContext context,
    AbstractParquetRowGroupScan rowGroupScan, OperatorContext oContext,
    ColumnExplorer columnExplorer, List<CommonParquetRecordReader> readers,
    List<Map<String, String>> implicitColumns, Map<String, String> mapWithMaxColumns,
    RowGroupReadEntry rowGroup, DrillFileSystem fs, ParquetMetadata footer,
    boolean readSchemaOnly) {
  ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig();
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(footer, rowGroupScan.getColumns(),
          readerConfig.autoCorrectCorruptedDates());
  logger.debug("Contains corrupt dates: {}.", containsCorruptDates);
  boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER);
  boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(footer, rowGroupScan.getColumns());
  logger.debug("PARQUET_NEW_RECORD_READER is {}. Complex columns {}.",
      useNewReader ? "enabled" : "disabled",
      containsComplexColumn ? "found" : "not found");
  // if readSchemaOnly is set, read zero rows (fetch the schema only)
  long recordsToRead = readSchemaOnly ? 0 : rowGroup.getNumRecordsToRead();
  CommonParquetRecordReader reader;
  if (useNewReader || containsComplexColumn) {
    reader = new DrillParquetReader(context, footer, rowGroup,
        columnExplorer.getTableColumns(), fs, containsCorruptDates, recordsToRead);
  } else {
    CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(
        fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0);
    reader = new ParquetRecordReader(context, rowGroup.getPath(), rowGroup.getRowGroupIndex(),
        recordsToRead, fs, ccf, footer, rowGroupScan.getColumns(), containsCorruptDates);
  }
  logger.debug("Query {} uses {}",
      QueryIdHelper.getQueryId(oContext.getFragmentContext().getHandle().getQueryId()),
      reader.getClass().getSimpleName());
  readers.add(reader);
  List<String> partitionValues = rowGroupScan.getPartitionValues(rowGroup);
  Map<String, String> implicitValues = columnExplorer.populateColumns(rowGroup.getPath(),
      partitionValues, rowGroupScan.supportsFileImplicitColumns(), fs,
      rowGroup.getRowGroupIndex(), rowGroup.getStart(), rowGroup.getLength());
  implicitColumns.add(implicitValues);
  if (implicitValues.size() > mapWithMaxColumns.size()) {
    mapWithMaxColumns = implicitValues;
  }
  return mapWithMaxColumns;
}
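For context, here is a minimal sketch of how a batch creator might drive this method across all row groups of a scan, keeping the widest implicit-column map. The loop shape and the readFooter helper are assumptions for illustration, not code from Drill:

// Hedged sketch: iterate the scan's row group entries, creating one reader per
// row group and keeping whichever implicit-column map has the most entries.
Map<String, String> mapWithMaxColumns = new LinkedHashMap<>();
for (RowGroupReadEntry rowGroup : rowGroupScan.getRowGroupReadEntries()) {
  ParquetMetadata footer = readFooter(fs, rowGroup); // hypothetical footer-loading helper
  mapWithMaxColumns = createReaderAndImplicitColumns(context, rowGroupScan, oContext,
      columnExplorer, readers, implicitColumns, mapWithMaxColumns, rowGroup, fs,
      footer, false);
}
// After the loop, every map in implicitColumns can be padded to the key set of
// mapWithMaxColumns so all readers expose a uniform set of implicit columns.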
Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.
The class DrillCompressionCodecFactory, method getCompressor.
@Override
public synchronized BytesInputCompressor getCompressor(CompressionCodecName codecName) {
  if (AIRCOMPRESSOR_CODECS.contains(codecName)) {
    return airCompressors.computeIfAbsent(codecName,
        c -> new AirliftBytesInputCompressor(codecName, allocator));
  } else {
    // Work around PARQUET-2126: construct a new codec factory every time to
    // avoid a concurrency bug, cf. DRILL-8139. Fortunately, constructing
    // and releasing codec factories appears to be lightweight.
    CompressionCodecFactory ccf = CodecFactory.createDirectCodecFactory(config, allocator, pageSize);
    // hold onto a reference for a later release()
    singleUseFactories.add(ccf);
    return ccf.getCompressor(codecName);
    // TODO: replace the above with the below once PARQUET-2126 is fixed
    // return parqCodecFactory.getCompressor(codecName);
  }
}
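The PARQUET-2126 workaround accumulates one factory per call, which is why each is stashed in singleUseFactories. A plausible companion release() is sketched below, under the assumption that both the per-call factories and the Airlift compressors expose release(); the actual Drill implementation may differ:

@Override
public synchronized void release() {
  // Free the factories created per getCompressor()/getDecompressor() call
  // as part of the PARQUET-2126 workaround.
  singleUseFactories.forEach(CompressionCodecFactory::release);
  singleUseFactories.clear();
  // Free the direct buffers held by the Airlift-backed compressors.
  airCompressors.values().forEach(AirliftBytesInputCompressor::release);
  airCompressors.clear();
}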
Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.
The class DrillParquetReader, method setup.
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
  try {
    this.operatorContext = context;
    schema = footer.getFileMetaData().getSchema();
    MessageType projection;
    final List<SchemaPath> columnsNotFound = new ArrayList<>(getColumns().size());
    if (isStarQuery()) {
      projection = schema;
    } else {
      projection = getProjection(schema, getColumns(), columnsNotFound);
      if (projection == null) {
        projection = schema;
      }
      if (!columnsNotFound.isEmpty()) {
        nullFilledVectors = new ArrayList<>(columnsNotFound.size());
        for (SchemaPath col : columnsNotFound) {
          // col.toExpr() is used as the field name here because we don't want these fields to appear in the existing maps
          nullFilledVectors.add(output.addField(
              MaterializedField.create(col.toExpr(), OPTIONAL_INT), NullableIntVector.class));
        }
        noColumnsFound = columnsNotFound.size() == getColumns().size();
      }
    }
    logger.debug("Requesting schema {}", projection);
    if (!noColumnsFound) {
      // Discard the columns not found in the schema when creating the
      // DrillParquetRecordMaterializer, since they have already been added to the output.
      @SuppressWarnings("unchecked")
      Collection<SchemaPath> columns = columnsNotFound.isEmpty()
          ? getColumns()
          : CollectionUtils.subtract(getColumns(), columnsNotFound);
      recordMaterializer = new DrillParquetRecordMaterializer(output, projection, columns,
          fragmentContext.getOptions(), containsCorruptedDates);
    }
    if (numRecordsToRead == 0 || noColumnsFound) {
      // no need to initialize readers
      return;
    }
    ColumnIOFactory factory = new ColumnIOFactory(false);
    MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
    BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
    Map<ColumnPath, ColumnChunkMetaData> paths = blockMetaData.getColumns().stream()
        .collect(Collectors.toMap(ColumnChunkMetaData::getPath, Function.identity(), (o, n) -> n));
    BufferAllocator allocator = operatorContext.getAllocator();
    CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(
        drillFileSystem.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);
    pageReadStore = new ColumnChunkIncReadStore(numRecordsToRead, ccf, allocator,
        drillFileSystem, entry.getPath());
    for (String[] path : schema.getPaths()) {
      Type type = schema.getType(path);
      if (type.isPrimitive()) {
        ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
        pageReadStore.addColumn(schema.getColumnDescription(path), md);
      }
    }
    recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
  } catch (Exception e) {
    throw handleAndRaise("Failure in setting up reader", e);
  }
}
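The nullFilledVectors created above carry the requested-but-missing columns. Below is a hedged sketch of how the reader's next() might serve the noColumnsFound case by emitting null-filled batches; the totalRead field, the 4096 batch cap, and the readBatch helper are illustrative assumptions, not the Drill implementation:

@Override
public int next() {
  // Illustrative sketch: if none of the projected columns exist in the file,
  // emit batches of nulls instead of driving the Parquet record reader.
  if (noColumnsFound) {
    if (totalRead >= numRecordsToRead) {
      return 0; // all requested rows have been delivered
    }
    int rows = (int) Math.min(numRecordsToRead - totalRead, 4096); // assumed batch cap
    for (NullableIntVector v : nullFilledVectors) {
      v.getMutator().setValueCount(rows); // entries default to null
    }
    totalRead += rows;
    return rows;
  }
  // normal path (omitted): drive recordReader/recordMaterializer to fill the output vectors
  return readBatch(); // hypothetical helper standing in for the real read loop
}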
Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.
The class ParquetRecordReaderTest, method testPerformance.
@Test
@Ignore
public void testPerformance() throws Exception {
  final DrillbitContext bitContext = mock(DrillbitContext.class);
  final UserClientConnection connection = mock(UserClientConnection.class);
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContextImpl context = new FragmentContextImpl(bitContext,
      BitControl.PlanFragment.getDefaultInstance(), connection, registry);
  final Path fileName = new Path("/tmp/parquet_test_performance.parquet");
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props =
      new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);
  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, fileName);
  final Footer f = footers.iterator().next();
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
  int totalRowCount = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
  for (int i = 0; i < 25; i++) {
    CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(
        dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0);
    final ParquetRecordReader rr = new ParquetRecordReader(context, fileName, 0, fs, ccf,
        f.getParquetMetadata(), columns, ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    rr.setup(null, mutator);
    final Stopwatch watch = Stopwatch.createStarted();
    int rowCount = 0;
    while ((rowCount = rr.next()) > 0) {
      totalRowCount += rowCount;
    }
    rr.close();
  }
  allocator.close();
}
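As excerpted, the test starts a Stopwatch but never reads it. If one wanted a throughput figure, the inner loop could be extended along these lines (a java.util.concurrent.TimeUnit import and a logger field are assumed):

final Stopwatch watch = Stopwatch.createStarted();
int rowCount;
int iterationRows = 0;
while ((rowCount = rr.next()) > 0) {
  iterationRows += rowCount;
}
long elapsedMs = watch.elapsed(TimeUnit.MILLISECONDS);
logger.debug("Iteration read {} rows in {} ms ({} rows/s)",
    iterationRows, elapsedMs, elapsedMs == 0 ? 0 : iterationRows * 1000L / elapsedMs);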
Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.
The class DrillCompressionCodecFactory, method getDecompressor.
@Override
public synchronized BytesInputDecompressor getDecompressor(CompressionCodecName codecName) {
  if (AIRCOMPRESSOR_CODECS.contains(codecName)) {
    return airCompressors.computeIfAbsent(codecName,
        c -> new AirliftBytesInputCompressor(codecName, allocator));
  } else {
    // Work around PARQUET-2126: construct a new codec factory every time to
    // avoid a concurrency bug, cf. DRILL-8139. Fortunately, constructing
    // and releasing codec factories appears to be lightweight.
    CompressionCodecFactory ccf = CodecFactory.createDirectCodecFactory(config, allocator, pageSize);
    // hold onto a reference for a later release()
    singleUseFactories.add(ccf);
    return ccf.getDecompressor(codecName);
    // TODO: replace the above with the below once PARQUET-2126 is fixed
    // return parqCodecFactory.getDecompressor(codecName);
  }
}
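Taken together, a typical consumer of this factory decompresses a page and then releases the factory once the scan completes. A hedged usage sketch follows; the variable names compressedPage and uncompressedPageSize and the Snappy codec choice are illustrative:

CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(
    conf, new ParquetDirectByteBufferAllocator(allocator), /* pageSize */ 0);
try {
  BytesInputDecompressor d = ccf.getDecompressor(CompressionCodecName.SNAPPY);
  BytesInput page = d.decompress(compressedPage, uncompressedPageSize);
  // feed the decompressed bytes to the page reader
} finally {
  ccf.release(); // frees the single-use factories and Airlift compressor buffers
}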