Search in sources :

Example 6 with RecordReader

Use of org.apache.parquet.io.RecordReader in the Apache Ignite project.

In the class SparkModelParser, method loadLinRegModel.

/**
 * Load linear regression model.
 *
 * Every record of every row group in the parquet file is read; the intercept and the coefficients
 * taken from the last record read are the ones the returned model is built from.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return Linear regression model. If reading fails with an {@link IOException}, the error is logged and
 *      the model is built from the values read so far (initially {@code null} coefficients and a zero intercept).
 */
private static Model loadLinRegModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            // The row count is a long, so the counter is a long as well: an int counter would
            // overflow instead of terminating for a row group larger than Integer.MAX_VALUE.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return new LinearRegressionModel(coefficients, interceptor);
}
Also used : Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) Configuration(org.apache.hadoop.conf.Configuration) LinearRegressionModel(org.apache.ignite.ml.regressions.linear.LinearRegressionModel) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) IOException(java.io.IOException) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) PageReadStore(org.apache.parquet.column.page.PageReadStore) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) MessageType(org.apache.parquet.schema.MessageType)

Example 7 with RecordReader

Use of org.apache.parquet.io.RecordReader in the Apache Ignite project.

In the class SparkModelParser, method parseTreesForRandomForestAlgorithm.

/**
 * Parse trees from file for common Random Forest ensemble.
 *
 * Records are grouped by tree id and, within a tree, by node id; one decision tree model is then built
 * per tree id, in ascending tree id order.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 * @return List of decision tree models, or {@code null} if reading the parquet file fails with an
 *      {@link IOException} (the error is logged).
 */
private static List<IgniteModel<Vector, Double>> parseTreesForRandomForestAlgorithm(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        // Sorted maps keep both the trees and the nodes of each tree ordered by their ids.
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            // The row count is a long, so the counter is a long as well: an int counter would
            // overflow instead of terminating for a row group larger than Integer.MAX_VALUE.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                // Field 0 of a record is read as the tree id, field 1 as the group with the node data.
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);
                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);
                // Create the per-tree node map on first sight of the tree id, then register the node.
                nodesByTreeId.computeIfAbsent(treeID, k -> new TreeMap<>()).put(nodeData.id, nodeData);
            }
        }
        List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return models;
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) ArrayList(java.util.ArrayList) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) IOException(java.io.IOException) TreeMap(java.util.TreeMap) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) NodeData(org.apache.ignite.ml.tree.NodeData) PageReadStore(org.apache.parquet.column.page.PageReadStore) IgniteModel(org.apache.ignite.ml.IgniteModel) MessageType(org.apache.parquet.schema.MessageType)

Example 8 with RecordReader

Use of org.apache.parquet.io.RecordReader in the Apache Ignite project.

In the class SparkModelParser, method parseAndBuildGDBModel.

/**
 * Parse and build common GDB model with the custom label mapper.
 *
 * The tree weights are read from the meta data file first, then the tree nodes are read from the model
 * file, grouped by tree id and turned into one decision tree model per tree id (ascending order).
 *
 * @param pathToMdl Path to model.
 * @param pathToMdlMetaData Path to model meta data.
 * @param lbMapper Label mapper.
 * @param learningEnvironment learningEnvironment
 * @return GDB model, or {@code null} if either parquet file cannot be read or the meta data holds no weight
 *      for one of the tree ids {@code 0..n-1} (the problem is logged).
 */
@Nullable
private static Model parseAndBuildGDBModel(String pathToMdl, String pathToMdlMetaData, IgniteFunction<Double, Double> lbMapper, LearningEnvironment learningEnvironment) {
    final Map<Integer, Double> treeWeightsByTreeID = new HashMap<>();
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdlMetaData), new Configuration()))) {
        PageReadStore pagesMetaData;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pagesMetaData = r.readNextRowGroup())) {
            final long rows = pagesMetaData.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
            // The row count is a long, so the counter is a long as well: an int counter would
            // overflow instead of terminating for a row group larger than Integer.MAX_VALUE.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                // Field 0 of a meta data record is read as the tree id, field 2 as the tree weight.
                int treeId = g.getInteger(0, 0);
                double treeWeight = g.getDouble(2, 0);
                treeWeightsByTreeID.put(treeId, treeWeight);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file with MetaData by the path: " + pathToMdlMetaData;
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
        // Without the weights the ensemble cannot be assembled correctly, so stop here
        // instead of building a model around an empty or partial weight array.
        return null;
    }
    // Weights are laid out by tree id, so ids are expected to be exactly 0..n-1.
    final double[] treeWeights = new double[treeWeightsByTreeID.size()];
    for (int i = 0; i < treeWeights.length; i++) {
        final Double weight = treeWeightsByTreeID.get(i);
        if (weight == null) {
            // Report the gap explicitly rather than failing with a bare NullPointerException on unboxing.
            String msg = "Tree weight is missing in MetaData for tree id " + i + " by the path: " + pathToMdlMetaData;
            learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
            return null;
        }
        treeWeights[i] = weight;
    }
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        // Sorted maps keep both the trees and the nodes of each tree ordered by their ids.
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                // Field 0 of a record is read as the tree id, field 1 as the group with the node data.
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);
                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);
                // Create the per-tree node map on first sight of the tree id, then register the node.
                nodesByTreeId.computeIfAbsent(treeID, k -> new TreeMap<>()).put(nodeData.id, nodeData);
            }
        }
        final List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return new GDBModel(models, new WeightedPredictionsAggregator(treeWeights), lbMapper);
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) RecordReader(org.apache.parquet.io.RecordReader) ArrayList(java.util.ArrayList) GDBModel(org.apache.ignite.ml.composition.boosting.GDBModel) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) PageReadStore(org.apache.parquet.column.page.PageReadStore) MessageType(org.apache.parquet.schema.MessageType) Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) WeightedPredictionsAggregator(org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) NodeData(org.apache.ignite.ml.tree.NodeData) IgniteModel(org.apache.ignite.ml.IgniteModel) Nullable(org.jetbrains.annotations.Nullable)

Aggregations

IOException (java.io.IOException)8 Configuration (org.apache.hadoop.conf.Configuration)7 Path (org.apache.hadoop.fs.Path)7 PageReadStore (org.apache.parquet.column.page.PageReadStore)7 SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup)7 GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter)7 ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader)7 ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory)7 MessageColumnIO (org.apache.parquet.io.MessageColumnIO)7 RecordReader (org.apache.parquet.io.RecordReader)7 MessageType (org.apache.parquet.schema.MessageType)7 Vector (org.apache.ignite.ml.math.primitives.vector.Vector)4 DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector)4 ArrayList (java.util.ArrayList)3 TreeMap (java.util.TreeMap)3 NodeData (org.apache.ignite.ml.tree.NodeData)3 IgniteModel (org.apache.ignite.ml.IgniteModel)2 Arrays (java.util.Arrays)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1