
Example 21 with PageReadStore

use of org.apache.parquet.column.page.PageReadStore in project ignite by apache.

the class SparkModelParser method parseTreesForRandomForestAlgorithm.

/**
 * Parse trees from the Parquet model file for a Random Forest ensemble.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static List<IgniteModel<Vector, Double>> parseTreesForRandomForestAlgorithm(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);
                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);
                if (nodesByTreeId.containsKey(treeID)) {
                    Map<Integer, NodeData> nodesByNodeId = nodesByTreeId.get(treeID);
                    nodesByNodeId.put(nodeData.id, nodeData);
                } else {
                    TreeMap<Integer, NodeData> nodesByNodeId = new TreeMap<>();
                    nodesByNodeId.put(nodeData.id, nodeData);
                    nodesByTreeId.put(treeID, nodesByNodeId);
                }
            }
        }
        List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return models;
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) Configuration(org.apache.hadoop.conf.Configuration) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordReader(org.apache.parquet.io.RecordReader) ArrayList(java.util.ArrayList) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) IOException(java.io.IOException) TreeMap(java.util.TreeMap) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) NodeData(org.apache.ignite.ml.tree.NodeData) PageReadStore(org.apache.parquet.column.page.PageReadStore) IgniteModel(org.apache.ignite.ml.IgniteModel) MessageType(org.apache.parquet.schema.MessageType)
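For context, the sketch below distills the row-reading pattern that the method above relies on, stripped of the Ignite-specific tree assembly. It is only an illustration: the file path is a placeholder, and printing each record stands in for the treeID/NodeData extraction done in the real parser.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class ReadParquetGroups {
    public static void main(String[] args) throws IOException {
        // Placeholder path; in the Ignite method above this is pathToMdl.
        String pathToFile = "/path/to/model.parquet";
        try (ParquetFileReader r = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(pathToFile), new Configuration()))) {
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
            PageReadStore pages;
            // readNextRowGroup() returns one row group at a time and null at end of file.
            while (null != (pages = r.readNextRowGroup())) {
                RecordReader<Group> recordReader =
                    colIO.getRecordReader(pages, new GroupRecordConverter(schema));
                for (long i = 0, rows = pages.getRowCount(); i < rows; i++) {
                    Group g = recordReader.read();
                    // The Ignite parser reads the tree id (field 0) and a nested node-data
                    // group (field 1) at this point; this sketch just dumps the record.
                    System.out.println(g);
                }
            }
        }
    }
}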

Example 22 with PageReadStore

use of org.apache.parquet.column.page.PageReadStore in project ignite by apache.

the class SparkModelParser method parseAndBuildGDBModel.

/**
 * Parse and build a GDB model with a custom label mapper.
 *
 * @param pathToMdl Path to model.
 * @param pathToMdlMetaData Path to model meta data.
 * @param lbMapper Label mapper.
 * @param learningEnvironment Learning environment.
 */
@Nullable
private static Model parseAndBuildGDBModel(String pathToMdl, String pathToMdlMetaData, IgniteFunction<Double, Double> lbMapper, LearningEnvironment learningEnvironment) {
    double[] treeWeights = null;
    final Map<Integer, Double> treeWeightsByTreeID = new HashMap<>();
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdlMetaData), new Configuration()))) {
        PageReadStore pagesMetaData;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        while (null != (pagesMetaData = r.readNextRowGroup())) {
            final long rows = pagesMetaData.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                int treeId = g.getInteger(0, 0);
                double treeWeight = g.getDouble(2, 0);
                treeWeightsByTreeID.put(treeId, treeWeight);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file with MetaData by the path: " + pathToMdlMetaData;
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    treeWeights = new double[treeWeightsByTreeID.size()];
    for (int i = 0; i < treeWeights.length; i++) treeWeights[i] = treeWeightsByTreeID.get(i);
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);
                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);
                if (nodesByTreeId.containsKey(treeID)) {
                    Map<Integer, NodeData> nodesByNodeId = nodesByTreeId.get(treeID);
                    nodesByNodeId.put(nodeData.id, nodeData);
                } else {
                    TreeMap<Integer, NodeData> nodesByNodeId = new TreeMap<>();
                    nodesByNodeId.put(nodeData.id, nodeData);
                    nodesByTreeId.put(treeID, nodesByNodeId);
                }
            }
        }
        final List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return new GDBModel(models, new WeightedPredictionsAggregator(treeWeights), lbMapper);
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) RecordReader(org.apache.parquet.io.RecordReader) ArrayList(java.util.ArrayList) GDBModel(org.apache.ignite.ml.composition.boosting.GDBModel) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) MessageColumnIO(org.apache.parquet.io.MessageColumnIO) PageReadStore(org.apache.parquet.column.page.PageReadStore) MessageType(org.apache.parquet.schema.MessageType) Path(org.apache.hadoop.fs.Path) GroupRecordConverter(org.apache.parquet.example.data.simple.convert.GroupRecordConverter) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) WeightedPredictionsAggregator(org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ColumnIOFactory(org.apache.parquet.io.ColumnIOFactory) NodeData(org.apache.ignite.ml.tree.NodeData) IgniteModel(org.apache.ignite.ml.IgniteModel) Nullable(org.jetbrains.annotations.Nullable)
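Two reading notes on this example. First, the weight-array loop (treeWeights[i] = treeWeightsByTreeID.get(i)) assumes tree IDs are contiguous and start at 0; a missing ID would unbox a null and throw. Second, the lbMapper parameter is an IgniteFunction<Double, Double> that turns the raw ensemble output into a label. A hypothetical mapper for a binary classifier is sketched below; the threshold-at-zero rule is an assumption for illustration, not necessarily what callers of this method pass.

// Hypothetical label mapper: maps the raw GDB prediction to a 0/1 class label.
// The sign-threshold rule here is illustrative only.
IgniteFunction<Double, Double> lbMapper = rawPrediction -> rawPrediction >= 0 ? 1.0 : 0.0;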

Example 23 with PageReadStore

use of org.apache.parquet.column.page.PageReadStore in project flink by apache.

the class ParquetColumnarRowSplitReader method readNextRowGroup.

private void readNextRowGroup() throws IOException {
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
        throw new IOException("expecting more rows but reached last block. Read " + rowsReturned + " out of " + totalRowCount);
    }
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    columnReaders = new AbstractColumnReader[columns.size()];
    for (int i = 0; i < columns.size(); ++i) {
        columnReaders[i] = createColumnReader(utcTimestamp, selectedTypes[i], columns.get(i), pages.getPageReader(columns.get(i)));
    }
    totalCountLoadedSoFar += pages.getRowCount();
}
Also used : PageReadStore(org.apache.parquet.column.page.PageReadStore) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) IOException(java.io.IOException)
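Each call to pages.getPageReader(column) hands the column reader a per-column stream of pages for the current row group. As a rough illustration of what that stream looks like at the parquet-column level (not Flink's actual batch decoding, which lives in AbstractColumnReader), one could drain it like this:

// Illustrative only: iterate the raw pages of one column chunk in a row group.
// Requires org.apache.parquet.column.page.PageReader, DataPage and DictionaryPage.
private static void drainColumn(PageReadStore pages, ColumnDescriptor column) {
    PageReader pageReader = pages.getPageReader(column);
    DictionaryPage dictionary = pageReader.readDictionaryPage(); // null if the chunk has no dictionary; unused in this sketch
    long remaining = pageReader.getTotalValueCount();
    while (remaining > 0) {
        DataPage page = pageReader.readPage();
        remaining -= page.getValueCount();
        // A real reader would decode the page's repetition/definition levels and values here.
    }
}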

Example 24 with PageReadStore

use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

the class TestDataPageV1Checksums method testCompression.

/**
 * Tests that the checksum is calculated using the compressed version of the data and that
 * checksum verification succeeds.
 */
@Test
public void testCompression() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.SNAPPY);
    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();
        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage1, snappy(colAPage1Bytes));
        assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);
        DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage2, snappy(colAPage2Bytes));
        assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);
        DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage1, snappy(colBPage1Bytes));
        assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage2, snappy(colBPage2Bytes));
        assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) PageReadStore(org.apache.parquet.column.page.PageReadStore) DataPageV1(org.apache.parquet.column.page.DataPageV1) Test(org.junit.Test)
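The helpers readNextPage(...) and assertCrcSetAndCorrect(...) are defined elsewhere in the test class. The sketch below shows roughly what they could look like, assuming DataPage exposes its checksum via getCrc() and that the reference bytes passed in are the (possibly compressed) page bytes the CRC was computed over. Treat it as a reconstruction, not the actual parquet-mr test code.

// Rough reconstruction of the test helpers (assumptions, not the real parquet-mr code).
// Requires java.util.zip.CRC32 and static org.junit.Assert.*.
private static DataPageV1 readNextPage(ColumnDescriptor desc, PageReadStore pageReadStore) {
    // The test writes only V1 data pages, so the cast is safe in this context.
    return (DataPageV1) pageReadStore.getPageReader(desc).readPage();
}

private static void assertCrcSetAndCorrect(DataPageV1 page, byte[] referenceBytes) {
    assertTrue("Page CRC should be written", page.getCrc().isPresent());
    CRC32 crc = new CRC32();
    crc.update(referenceBytes, 0, referenceBytes.length);
    assertEquals("CRC should match the reference bytes",
        (int) crc.getValue(), page.getCrc().getAsInt());
}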

Example 25 with PageReadStore

use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.

the class TestDataPageV1Checksums method testWriteOnVerifyOn.

/**
 * Write out checksums and verify them on the read path. Tests that the CRC is set and that we can
 * read back what we wrote when checksums are enabled on both the write and read paths.
 */
@Test
public void testWriteOnVerifyOn() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();
        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
        assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);
        DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
        assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);
        DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
        assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
        assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) PageReadStore(org.apache.parquet.column.page.PageReadStore) DataPageV1(org.apache.parquet.column.page.DataPageV1) Test(org.junit.Test)
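writeSimpleParquetFile(conf, codec) is another helper not shown here. A rough, hypothetical stand-in using the parquet example API is sketched below; the schema, field names (colA/colB), row contents, output path, and the assumption that the page-write-checksum flag is taken from the Configuration are all invented for illustration. The real test most likely controls page contents more directly so that the expected per-page byte arrays (colAPage1Bytes etc.) are known.

// Hypothetical stand-in for writeSimpleParquetFile(conf, codec); illustration only.
// Requires ExampleParquetWriter, SimpleGroupFactory, Group and MessageTypeParser.
private static Path writeSimpleParquetFile(Configuration conf, CompressionCodecName codec) throws IOException {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message test { required int32 colA; required int32 colB; }");
    Path path = new Path("/tmp/checksum-test.parquet"); // placeholder; a real test would use a temp dir
    SimpleGroupFactory factory = new SimpleGroupFactory(schema);
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
            .withConf(conf) // assumption: the write-checksum setting is picked up from conf
            .withType(schema)
            .withCompressionCodec(codec)
            .build()) {
        for (int i = 0; i < 100; i++) {
            writer.write(factory.newGroup().append("colA", i).append("colB", i * 2));
        }
    }
    return path;
}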

Aggregations

PageReadStore (org.apache.parquet.column.page.PageReadStore): 31
Configuration (org.apache.hadoop.conf.Configuration): 22
Path (org.apache.hadoop.fs.Path): 22
IOException (java.io.IOException): 14
MessageType (org.apache.parquet.schema.MessageType): 14
Test (org.junit.Test): 13
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 12
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 10
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 8
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7
RecordReader (org.apache.parquet.io.RecordReader): 7
DataPageV1 (org.apache.parquet.column.page.DataPageV1): 6
Encoding (org.apache.parquet.column.Encoding): 5
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 5
File (java.io.File): 4
List (java.util.List): 4
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4