
Example 1 with RecordReader

use of org.apache.parquet.io.RecordReader in project drill by apache.

the class DrillParquetReader method setup.

@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
    try {
        this.operatorContext = context;
        schema = footer.getFileMetaData().getSchema();
        MessageType projection;
        final List<SchemaPath> columnsNotFound = new ArrayList<>(getColumns().size());
        if (isStarQuery()) {
            projection = schema;
        } else {
            projection = getProjection(schema, getColumns(), columnsNotFound);
            if (projection == null) {
                projection = schema;
            }
            if (!columnsNotFound.isEmpty()) {
                nullFilledVectors = new ArrayList<>(columnsNotFound.size());
                for (SchemaPath col : columnsNotFound) {
                    // col.toExpr() is used here as field name since we don't want to see these fields in the existing maps
                    nullFilledVectors.add(output.addField(MaterializedField.create(col.toExpr(), OPTIONAL_INT), NullableIntVector.class));
                }
                noColumnsFound = columnsNotFound.size() == getColumns().size();
            }
        }
        logger.debug("Requesting schema {}", projection);
        if (!noColumnsFound) {
            // Discard the columns not found in the schema when creating the DrillParquetRecordMaterializer, since they have already been added to the output.
            @SuppressWarnings("unchecked") Collection<SchemaPath> columns = columnsNotFound.isEmpty() ? getColumns() : CollectionUtils.subtract(getColumns(), columnsNotFound);
            recordMaterializer = new DrillParquetRecordMaterializer(output, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
        }
        if (numRecordsToRead == 0 || noColumnsFound) {
            // no need to init readers
            return;
        }
        ColumnIOFactory factory = new ColumnIOFactory(false);
        MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
        BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
        Map<ColumnPath, ColumnChunkMetaData> paths = blockMetaData.getColumns().stream().collect(Collectors.toMap(ColumnChunkMetaData::getPath, Function.identity(), (o, n) -> n));
        BufferAllocator allocator = operatorContext.getAllocator();
        CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(drillFileSystem.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);
        pageReadStore = new ColumnChunkIncReadStore(numRecordsToRead, ccf, allocator, drillFileSystem, entry.getPath());
        for (String[] path : schema.getPaths()) {
            Type type = schema.getType(path);
            if (type.isPrimitive()) {
                ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
                pageReadStore.addColumn(schema.getColumnDescription(path), md);
            }
        }
        recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
    } catch (Exception e) {
        throw handleAndRaise("Failure in setting up reader", e);
    }
}
Also used : Arrays(java.util.Arrays) BufferAllocator(org.apache.drill.exec.memory.BufferAllocator) ParquetDirectByteBufferAllocator(org.apache.drill.exec.store.parquet.ParquetDirectByteBufferAllocator) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) LoggerFactory(org.slf4j.LoggerFactory) OutputMutator(org.apache.drill.exec.physical.impl.OutputMutator) OperatorContext(org.apache.drill.exec.ops.OperatorContext) DrillFileSystem(org.apache.drill.exec.store.dfs.DrillFileSystem) PathSegment(org.apache.drill.common.expression.PathSegment) Map(java.util.Map) RowGroupReadEntry(org.apache.drill.exec.store.parquet.RowGroupReadEntry) Types(org.apache.parquet.schema.Types) ValueVector(org.apache.drill.exec.vector.ValueVector) GroupType(org.apache.parquet.schema.GroupType) Collection(java.util.Collection) SchemaPath(org.apache.drill.common.expression.SchemaPath) Set(java.util.Set) Collectors(java.util.stream.Collectors) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Preconditions(org.apache.drill.shaded.guava.com.google.common.base.Preconditions) Type(org.apache.parquet.schema.Type) ExecConstants(org.apache.drill.exec.ExecConstants) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) NullableIntVector(org.apache.drill.exec.vector.NullableIntVector) MaterializedField(org.apache.drill.exec.record.MaterializedField) Function(java.util.function.Function) CommonParquetRecordReader(org.apache.drill.exec.store.CommonParquetRecordReader) ArrayList(java.util.ArrayList) OutOfMemoryException(org.apache.drill.exec.exception.OutOfMemoryException) AllocationHelper(org.apache.drill.exec.vector.AllocationHelper) CollectionUtils(org.apache.commons.collections.CollectionUtils) ExecutionSetupException(org.apache.drill.common.exceptions.ExecutionSetupException) CompressionCodecFactory(org.apache.parquet.compression.CompressionCodecFactory) DrillCompressionCodecFactory(org.apache.drill.exec.store.parquet.compression.DrillCompressionCodecFactory) LinkedList(java.util.LinkedList) LinkedHashSet(java.util.LinkedHashSet) FragmentContext(org.apache.drill.exec.ops.FragmentContext) Logger(org.slf4j.Logger) IOException(java.io.IOException) ColumnChunkIncReadStore(org.apache.parquet.hadoop.ColumnChunkIncReadStore) StringJoiner(java.util.StringJoiner) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) OPTIONAL_INT(org.apache.drill.common.types.Types.OPTIONAL_INT) RecordReader(org.apache.parquet.io.RecordReader)
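
The projection step above passes a "requested" schema together with the file schema to ColumnIOFactory. A minimal, self-contained sketch of that idea follows; it is not Drill code, and the field names ("id", "amount") are hypothetical placeholders for projected columns.

import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class ProjectionSketch {
    // Builds a projection (a subset of the file schema) and maps it onto the file layout,
    // mirroring the projection / getColumnIO(projection, schema) pair in the Drill reader above.
    static MessageColumnIO projectedColumnIO(MessageType fileSchema) {
        MessageType projection = Types.buildMessage()
            .optional(PrimitiveTypeName.INT64).named("id")
            .optional(PrimitiveTypeName.DOUBLE).named("amount")
            .named(fileSchema.getName());
        return new ColumnIOFactory().getColumnIO(projection, fileSchema);
    }
}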

Example 2 with RecordReader

use of org.apache.parquet.io.RecordReader in project ignite by apache.

the class SparkModelParser method loadKMeansModel.

/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadKMeansModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    Vector[] centers = null;
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int) pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);
                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);
                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);
                centers[i] = new DenseVector(amountOfCoefficients);
                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return new KMeansModel(centers, new EuclideanDistance());
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) IOException(java.io.IOException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) EuclideanDistance(org.apache.ignite.ml.math.distances.EuclideanDistance) PageReadStore(org.apache.parquet.column.page.PageReadStore) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) MessageType(org.apache.parquet.schema.MessageType)
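
The positional accessors above, such as g.getGroup(1, 0).getGroup(3, 0), depend on the exact layout Spark wrote. A small sketch for printing the file schema before hard-coding such indices; the file path below is a placeholder, not taken from the original code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class SchemaInspector {
    public static void main(String[] args) throws Exception {
        // Placeholder path; point it at the model's data/ part file.
        Path path = new Path("/tmp/kmeans/data/part-00000.parquet");
        try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            // Field indices used with Group.getGroup(fieldIndex, index) follow the declaration order printed here.
            System.out.println(schema);
        }
    }
}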

Example 3 with RecordReader

use of org.apache.parquet.io.RecordReader in project ignite by apache.

the class SparkModelParser method loadLinearSVMModel.

/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinearSVMModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return new SVMLinearClassificationModel(coefficients, interceptor);
}
Also used : Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) IOException(java.io.IOException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) PageReadStore(org.apache.parquet.column.page.PageReadStore) SVMLinearClassificationModel(org.apache.ignite.ml.svm.SVMLinearClassificationModel) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) MessageType(org.apache.parquet.schema.MessageType)
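
The helpers readSVMInterceptor and readSVMCoefficients are not shown on this page. Below is a purely hypothetical sketch of what such helpers could look like with the example Group API; the field indices (0 for the intercept, 1 for the coefficient list) are assumptions, not the actual Spark layout.

import org.apache.ignite.ml.math.primitives.vector.Vector;
import org.apache.ignite.ml.math.primitives.vector.impl.DenseVector;
import org.apache.parquet.example.data.Group;

final class SvmGroupSketch {
    static double readInterceptSketch(Group g) {
        return g.getDouble(0, 0);                 // assumed: field 0 holds the intercept
    }

    static Vector readCoefficientsSketch(Group g) {
        Group values = g.getGroup(1, 0);          // assumed: field 1 wraps the coefficient list
        int n = values.getFieldRepetitionCount(0);
        DenseVector coeffs = new DenseVector(n);
        for (int j = 0; j < n; j++)
            coeffs.set(j, values.getGroup(0, j).getDouble(0, 0));
        return coeffs;
    }
}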

Example 4 with RecordReader

use of org.apache.parquet.io.RecordReader in project ignite by apache.

the class SparkModelParser method loadDecisionTreeModel.

/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) IOException(java.io.IOException) TreeMap(java.util.TreeMap) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) NodeData(org.apache.ignite.ml.tree.NodeData) PageReadStore(org.apache.parquet.column.page.PageReadStore) MessageType(org.apache.parquet.schema.MessageType)

Example 5 with RecordReader

use of org.apache.parquet.io.RecordReader in project ignite by apache.

the class SparkModelParser method loadLogRegModel.

/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return new LogisticRegressionModel(coefficients, interceptor);
}
Also used : Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) LogisticRegressionModel(org.apache.ignite.ml.regressions.logistic.LogisticRegressionModel) IOException(java.io.IOException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) PageReadStore(org.apache.parquet.column.page.PageReadStore) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) MessageType(org.apache.parquet.schema.MessageType)
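
Examples 2 through 5 share the same open/iterate/read skeleton. A hedged sketch of factoring that skeleton into a single helper follows; the class and method names are hypothetical, not part of the Ignite source.

import java.io.IOException;
import java.util.function.Consumer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

final class ParquetRowVisitor {
    /** Reads every row of the Parquet file and hands it to the consumer. */
    static void forEachRow(String pathToMdl, Consumer<SimpleGroup> rowConsumer) throws IOException {
        try (ParquetFileReader r = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
            PageReadStore pages;
            while (null != (pages = r.readNextRowGroup())) {
                RecordReader<Group> recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
                for (long i = 0, rows = pages.getRowCount(); i < rows; i++)
                    rowConsumer.accept((SimpleGroup) recordReader.read());
            }
        }
    }
}

A loader such as loadLogRegModel could then pass a lambda that captures the intercept and coefficients from each row, keeping the per-model parsing logic separate from the file iteration.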

Aggregations

IOException (java.io.IOException)8 Configuration (org.apache.hadoop.conf.Configuration)7 Path (org.apache.hadoop.fs.Path)7 PageReadStore (org.apache.parquet.column.page.PageReadStore)7 SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup)7 GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter)7 ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader)7 ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory)7 MessageColumnIO (org.apache.parquet.io.MessageColumnIO)7 RecordReader (org.apache.parquet.io.RecordReader)7 MessageType (org.apache.parquet.schema.MessageType)7 Vector (org.apache.ignite.ml.math.primitives.vector.Vector)4 DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector)4 ArrayList (java.util.ArrayList)3 TreeMap (java.util.TreeMap)3 NodeData (org.apache.ignite.ml.tree.NodeData)3 IgniteModel (org.apache.ignite.ml.IgniteModel)2 Arrays (java.util.Arrays)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1