Example 1 with CompressionCodecFactory

Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.

From the class AbstractParquetScanBatchCreator, method createReaderAndImplicitColumns:

/**
 * Creates a reader and adds it to the list of readers.
 *
 * @param context the fragment context
 * @param rowGroupScan the row group scan
 * @param oContext the operator context
 * @param columnExplorer the column helper class object
 * @param readers the list of readers to which the new reader is added
 * @param implicitColumns the list of implicit columns
 * @param mapWithMaxColumns tracks the largest implicit-column map seen so far; replaced when this reader's map is larger
 * @param rowGroup the specific row group to create a reader for
 * @param fs the file system
 * @param footer this file's footer
 * @param readSchemaOnly if true, sets the number of rows to read to zero
 * @return the (possibly modified) input mapWithMaxColumns
 */
private Map<String, String> createReaderAndImplicitColumns(ExecutorFragmentContext context,
        AbstractParquetRowGroupScan rowGroupScan,
        OperatorContext oContext,
        ColumnExplorer columnExplorer,
        List<CommonParquetRecordReader> readers,
        List<Map<String, String>> implicitColumns,
        Map<String, String> mapWithMaxColumns,
        RowGroupReadEntry rowGroup,
        DrillFileSystem fs,
        ParquetMetadata footer,
        boolean readSchemaOnly) {
    ParquetReaderConfig readerConfig = rowGroupScan.getReaderConfig();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, rowGroupScan.getColumns(), readerConfig.autoCorrectCorruptedDates());
    logger.debug("Contains corrupt dates: {}.", containsCorruptDates);
    boolean useNewReader = context.getOptions().getBoolean(ExecConstants.PARQUET_NEW_RECORD_READER);
    boolean containsComplexColumn = ParquetReaderUtility.containsComplexColumn(footer, rowGroupScan.getColumns());
    logger.debug("PARQUET_NEW_RECORD_READER is {}. Complex columns {}.", useNewReader ? "enabled" : "disabled", containsComplexColumn ? "found" : "not found");
    // when readSchemaOnly is set, read zero rows (fetch the schema only)
    long recordsToRead = readSchemaOnly ? 0 : rowGroup.getNumRecordsToRead();
    CommonParquetRecordReader reader;
    if (useNewReader || containsComplexColumn) {
        reader = new DrillParquetReader(context, footer, rowGroup, columnExplorer.getTableColumns(), fs, containsCorruptDates, recordsToRead);
    } else {
        CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(fs.getConf(), new ParquetDirectByteBufferAllocator(oContext.getAllocator()), 0);
        reader = new ParquetRecordReader(context, rowGroup.getPath(), rowGroup.getRowGroupIndex(), recordsToRead, fs, ccf, footer, rowGroupScan.getColumns(), containsCorruptDates);
    }
    logger.debug("Query {} uses {}", QueryIdHelper.getQueryId(oContext.getFragmentContext().getHandle().getQueryId()), reader.getClass().getSimpleName());
    readers.add(reader);
    List<String> partitionValues = rowGroupScan.getPartitionValues(rowGroup);
    Map<String, String> implicitValues = columnExplorer.populateColumns(rowGroup.getPath(), partitionValues, rowGroupScan.supportsFileImplicitColumns(), fs, rowGroup.getRowGroupIndex(), rowGroup.getStart(), rowGroup.getLength());
    implicitColumns.add(implicitValues);
    if (implicitValues.size() > mapWithMaxColumns.size()) {
        mapWithMaxColumns = implicitValues;
    }
    return mapWithMaxColumns;
}
Also used : DrillParquetReader(org.apache.drill.exec.store.parquet2.DrillParquetReader) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) CommonParquetRecordReader(org.apache.drill.exec.store.CommonParquetRecordReader) CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) DrillCompressionCodecFactory(org.apache.drill.exec.store.parquet.compression.DrillCompressionCodecFactory)
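
The ccf built in the else branch implements Parquet's CompressionCodecFactory interface, so its compress/decompress surface can be exercised on its own. The following is a minimal standalone sketch, assuming only parquet-hadoop on the classpath; it substitutes Parquet's heap-backed CodecFactory for Drill's direct-memory wrapper, and the class name CodecRoundTrip is invented for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecRoundTrip {
    public static void main(String[] args) throws Exception {
        // heap-backed factory; Drill wraps the direct (off-heap) variant,
        // but both implement the same CompressionCodecFactory interface
        CompressionCodecFactory ccf = new CodecFactory(new Configuration(), 64 * 1024);
        try {
            byte[] raw = "hello parquet".getBytes("UTF-8");
            BytesInput compressed = ccf.getCompressor(CompressionCodecName.GZIP)
                    .compress(BytesInput.from(raw));
            BytesInput restored = ccf.getDecompressor(CompressionCodecName.GZIP)
                    .decompress(BytesInput.from(compressed.toByteArray()), raw.length);
            System.out.println(new String(restored.toByteArray(), "UTF-8"));
        } finally {
            // codec factories cache codec instances until released
            ccf.release();
        }
    }
}

The release() in the finally block matters: factories hold codec resources until released, which is why Drill keeps a reference to every factory it creates for later cleanup (see Example 2).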

Example 2 with CompressionCodecFactory

Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.

From the class DrillCompressionCodecFactory, method getCompressor:

@Override
public synchronized BytesInputCompressor getCompressor(CompressionCodecName codecName) {
    if (AIRCOMPRESSOR_CODECS.contains(codecName)) {
        return airCompressors.computeIfAbsent(codecName, c -> new AirliftBytesInputCompressor(codecName, allocator));
    } else {
        // Work around PARQUET-2126: construct a new codec factory every time to
        // avoid a concurrency bug, cf. DRILL-8139. Fortunately, constructing
        // and releasing codec factories appears to be lightweight.
        CompressionCodecFactory ccf = CodecFactory.createDirectCodecFactory(config, allocator, pageSize);
        // hold onto a reference for later release()
        singleUseFactories.add(ccf);
        return ccf.getCompressor(codecName);
    // TODO: replace the above with the below once PARQUET-2126 is fixed
    // return parqCodecFactory.getCompressor(codecName);
    }
}
Also used : CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory)
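
Stripped of Drill specifics, the workaround amounts to: build a disposable factory per call, remember it, and free everything in release(). Below is a hedged sketch of that shape using only Parquet's public API; the class and field names are illustrative rather than Drill's, and the heap-backed CodecFactory stands in for the direct one.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputCompressor;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

class SingleUseCodecFactories {
    private final Configuration conf = new Configuration();
    private final List<CompressionCodecFactory> singleUse = new ArrayList<>();

    synchronized BytesInputCompressor getCompressor(CompressionCodecName codecName) {
        // a fresh factory per call sidesteps the shared-state race of PARQUET-2126
        CompressionCodecFactory ccf = new CodecFactory(conf, 64 * 1024);
        singleUse.add(ccf); // keep a reference so release() can free it later
        return ccf.getCompressor(codecName);
    }

    synchronized void release() {
        singleUse.forEach(CompressionCodecFactory::release);
        singleUse.clear();
    }
}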

Example 3 with CompressionCodecFactory

Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.

From the class DrillParquetReader, method setup:

@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    try {
        this.operatorContext = context;
        schema = footer.getFileMetaData().getSchema();
        MessageType projection;
        final List<SchemaPath> columnsNotFound = new ArrayList<>(getColumns().size());
        if (isStarQuery()) {
            projection = schema;
        } else {
            projection = getProjection(schema, getColumns(), columnsNotFound);
            if (projection == null) {
                projection = schema;
            }
            if (!columnsNotFound.isEmpty()) {
                nullFilledVectors = new ArrayList<>(columnsNotFound.size());
                for (SchemaPath col : columnsNotFound) {
                    // col.toExpr() is used here as the field name since we don't want these fields to show up in the existing maps
                    nullFilledVectors.add(output.addField(MaterializedField.create(col.toExpr(), OPTIONAL_INT), NullableIntVector.class));
                }
                noColumnsFound = columnsNotFound.size() == getColumns().size();
            }
        }
        logger.debug("Requesting schema {}", projection);
        if (!noColumnsFound) {
            // Discard the columns not found in the schema when creating the DrillParquetRecordMaterializer, since they have already been added to the output.
            @SuppressWarnings("unchecked") Collection<SchemaPath> columns = columnsNotFound.isEmpty() ? getColumns() : CollectionUtils.subtract(getColumns(), columnsNotFound);
            recordMaterializer = new DrillParquetRecordMaterializer(output, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
        }
        if (numRecordsToRead == 0 || noColumnsFound) {
            // no need to init readers
            return;
        }
        ColumnIOFactory factory = new ColumnIOFactory(false);
        MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
        BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
        Map<ColumnPath, ColumnChunkMetaData> paths = blockMetaData.getColumns().stream().collect(Collectors.toMap(ColumnChunkMetaData::getPath, Function.identity(), (o, n) -> n));
        BufferAllocator allocator = operatorContext.getAllocator();
        CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(drillFileSystem.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);
        pageReadStore = new ColumnChunkIncReadStore(numRecordsToRead, ccf, allocator, drillFileSystem, entry.getPath());
        for (String[] path : schema.getPaths()) {
            Type type = schema.getType(path);
            if (type.isPrimitive()) {
                ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
                pageReadStore.addColumn(schema.getColumnDescription(path), md);
            }
        }
        recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
    } catch (Exception e) {
        throw handleAndRaise("Failure in setting up reader", e);
    }
}
Also used : Arrays(java.util.Arrays) BufferAllocator(org.apache.drill.exec.memory.BufferAllocator) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) LoggerFactory(org.slf4j.LoggerFactory) OutputMutator(org.apache.drill.exec.physical.impl.OutputMutator) OperatorContext(org.apache.drill.exec.ops.OperatorContext) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) PathSegment(org.apache.drill.common.expression.PathSegment) Map(java.util.Map) RowGroupReadEntry(org.apache.drill.exec.store.parquet.RowGroupReadEntry) Types(org.apache.parquet.schema.Types) ValueVector(org.apache.drill.exec.vector.ValueVector) GroupType(org.apache.parquet.schema.GroupType) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) Set(java.util.Set) Collectors(java.util.stream.Collectors) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) Type(org.apache.parquet.schema.Type) ExecConstants(org.apache.drill.exec.ExecConstants) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) NullableIntVector(org.apache.drill.exec.vector.NullableIntVector) MaterializedField(org.apache.drill.exec.record.MaterializedField) Function(java.util.function.Function) CommonParquetRecordReader(org.apache.drill.exec.store.CommonParquetRecordReader) ArrayList(java.util.ArrayList) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) AllocationHelper(org.apache.drill.exec.vector.AllocationHelper) CollectionUtils(org.apache.commons.collections.CollectionUtils) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) DrillCompressionCodecFactory(org.apache.drill.exec.store.parquet.compression.DrillCompressionCodecFactory) LinkedList(java.util.LinkedList) LinkedHashSet(java.util.LinkedHashSet) FragmentContext(org.apache.drill.exec.ops.FragmentContext) Logger(org.slf4j.Logger) IOException(java.io.IOException) ColumnChunkIncReadStore(org.apache.parquet.hadoop.ColumnChunkIncReadStore) StringJoiner(java.util.StringJoiner) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OPTIONAL_INT(org.apache.drill.common.types.Types.OPTIONAL_INT) RecordReader(org.apache.parquet.io.RecordReader)
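
The loop at the end of setup enumerates every leaf path of the Parquet schema and registers only primitive columns with the page read store, since only primitive leaves are backed by column chunks. That traversal is plain parquet-mr API and can be tried in isolation; the schema below is invented for illustration.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class SchemaPaths {
    public static void main(String[] args) {
        MessageType schema = Types.buildMessage()
                .optional(PrimitiveTypeName.INT32).named("id")
                .optionalGroup()
                    .optional(PrimitiveTypeName.BINARY).named("name")
                .named("details")
                .named("root");
        for (String[] path : schema.getPaths()) {
            // mirror setup(): only primitive leaves get a column chunk
            if (schema.getType(path).isPrimitive()) {
                System.out.println(schema.getColumnDescription(path));
            }
        }
    }
}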

Example 4 with CompressionCodecFactory

Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.

From the class ParquetRecordReaderTest, method testPerformance:

@Test
@Ignore
public void testPerformance() throws Exception {
    final DrillbitContext bitContext = mock(DrillbitContext.class);
    final UserClientConnection connection = mock(UserClientConnection.class);
    final DrillConfig c = DrillConfig.create();
    final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
    final FragmentContextImpl context = new FragmentContextImpl(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
    final Path fileName = new Path("/tmp/parquet_test_performance.parquet");
    final HashMap<String, FieldInfo> fields = new HashMap<>();
    final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
    populateFieldInfoMap(props);
    final Configuration dfsConfig = new Configuration();
    final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, fileName);
    final Footer f = footers.iterator().next();
    final List<SchemaPath> columns = Lists.newArrayList();
    columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
    int totalRowCount = 0;
    final FileSystem fs = new CachedSingleFileSystem(fileName);
    final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
    for (int i = 0; i < 25; i++) {
        CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0);
        final ParquetRecordReader rr = new ParquetRecordReader(context, fileName, 0, fs, ccf, f.getParquetMetadata(), columns, ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
        final TestOutputMutator mutator = new TestOutputMutator(allocator);
        rr.setup(null, mutator);
        final Stopwatch watch = Stopwatch.createStarted();
        int rowCount = 0;
        while ((rowCount = rr.next()) > 0) {
            totalRowCount += rowCount;
        }
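        // print per-pass timing so the Stopwatch above is actually consumed
        // (TimeUnit is fully qualified here to avoid adding an import)
        System.out.println("pass " + i + ": " + watch.elapsed(java.util.concurrent.TimeUnit.MILLISECONDS) + " ms");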
        rr.close();
    }
    allocator.close();
}
Also used : DrillbitContext(org.apache.drill.exec.server.DrillbitContext) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) Stopwatch(org.apache.drill.shaded.guava.com.google.common.base.Stopwatch) FragmentContextImpl(org.apache.drill.exec.ops.FragmentContextImpl) TestOutputMutator(org.apache.drill.exec.store.TestOutputMutator) DrillConfig(org.apache.drill.common.config.DrillConfig) CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) DrillCompressionCodecFactory(org.apache.drill.exec.store.parquet.compression.DrillCompressionCodecFactory) SchemaPath(org.apache.drill.common.expression.SchemaPath) FileSystem(org.apache.hadoop.fs.FileSystem) CachedSingleFileSystem(org.apache.drill.exec.store.CachedSingleFileSystem) FunctionImplementationRegistry(org.apache.drill.exec.expr.fn.FunctionImplementationRegistry) Path(org.apache.hadoop.fs.Path) BufferAllocator(org.apache.drill.exec.memory.BufferAllocator) ParquetRecordReader(org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) UserClientConnection(org.apache.drill.exec.rpc.UserClientConnection) Footer(org.apache.parquet.hadoop.Footer) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with CompressionCodecFactory

Use of org.apache.parquet.compression.CompressionCodecFactory in project drill by apache.

From the class DrillCompressionCodecFactory, method getDecompressor:

@Override
public synchronized BytesInputDecompressor getDecompressor(CompressionCodecName codecName) {
    if (AIRCOMPRESSOR_CODECS.contains(codecName)) {
        return airCompressors.computeIfAbsent(codecName, c -> new AirliftBytesInputCompressor(codecName, allocator));
    } else {
        // Work around PARQUET-2126: construct a new codec factory every time to
        // avoid a concurrency bug, cf. DRILL-8139. Fortunately, constructing
        // and releasing codec factories appears to be lightweight.
        CompressionCodecFactory ccf = CodecFactory.createDirectCodecFactory(config, allocator, pageSize);
        // hold onto a reference for later release()
        singleUseFactories.add(ccf);
        return ccf.getDecompressor(codecName);
    // TODO: replace the above with the below once PARQUET-2126 is fixed
    // return parqCodecFactory.getDecompressor(codecName);
    }
}
Also used : CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory)

Aggregations

CompressionCodecFactory (org.apache.parquet.compression.CompressionCodecFactory) 5
DrillCompressionCodecFactory (org.apache.drill.exec.store.parquet.compression.DrillCompressionCodecFactory) 3
SchemaPath (org.apache.drill.common.expression.SchemaPath) 2
BufferAllocator (org.apache.drill.exec.memory.BufferAllocator) 2
CommonParquetRecordReader (org.apache.drill.exec.store.CommonParquetRecordReader) 2
ParquetRecordReader (org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader) 2
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
Arrays (java.util.Arrays) 1
Collection (java.util.Collection) 1
HashMap (java.util.HashMap) 1
LinkedHashSet (java.util.LinkedHashSet) 1
LinkedList (java.util.LinkedList) 1
List (java.util.List) 1
Map (java.util.Map) 1
Set (java.util.Set) 1
StringJoiner (java.util.StringJoiner) 1
Function (java.util.function.Function) 1
Collectors (java.util.stream.Collectors) 1
CollectionUtils (org.apache.commons.collections.CollectionUtils) 1