Use of org.apache.parquet.column.page.PageReadStore in project ignite by apache.
The class SparkModelParser, method parseTreesForRandomForestAlgorithm.
/**
 * Parse trees from file for common Random Forest ensemble.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static List<IgniteModel<Vector, Double>> parseTreesForRandomForestAlgorithm(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);

                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);

                if (nodesByTreeId.containsKey(treeID)) {
                    Map<Integer, NodeData> nodesByNodeId = nodesByTreeId.get(treeID);
                    nodesByNodeId.put(nodeData.id, nodeData);
                } else {
                    TreeMap<Integer, NodeData> nodesByNodeId = new TreeMap<>();
                    nodesByNodeId.put(nodeData.id, nodeData);
                    nodesByTreeId.put(treeID, nodesByNodeId);
                }
            }
        }

        List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return models;
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
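The row-group iteration pattern used above is not specific to the Ignite model classes. A minimal, self-contained sketch of the same PageReadStore loop, assuming only a placeholder file path, could look like this:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class PageReadStoreSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder path; point this at any existing Parquet file.
        String pathToFile = "/tmp/example.parquet";

        try (ParquetFileReader r = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(pathToFile), new Configuration()))) {
            MessageType schema = r.getFooter().getFileMetaData().getSchema();
            MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

            PageReadStore pages;
            // readNextRowGroup() returns one row group at a time and null at end of file.
            while (null != (pages = r.readNextRowGroup())) {
                long rows = pages.getRowCount();
                RecordReader<Group> recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

                for (long i = 0; i < rows; i++) {
                    // Each record is materialized as a Group according to the file schema.
                    Group g = recordReader.read();
                    System.out.println(g);
                }
            }
        }
    }
}

Both Ignite methods in this section follow exactly this shape and differ only in how each materialized row is turned into NodeData entries.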
Use of org.apache.parquet.column.page.PageReadStore in project ignite by apache.
The class SparkModelParser, method parseAndBuildGDBModel.
/**
 * Parse and build common GDB model with the custom label mapper.
 *
 * @param pathToMdl Path to model.
 * @param pathToMdlMetaData Path to model meta data.
 * @param lbMapper Label mapper.
 * @param learningEnvironment Learning environment.
 */
@Nullable
private static Model parseAndBuildGDBModel(String pathToMdl, String pathToMdlMetaData,
    IgniteFunction<Double, Double> lbMapper, LearningEnvironment learningEnvironment) {
    double[] treeWeights = null;
    final Map<Integer, Double> treeWeightsByTreeID = new HashMap<>();

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdlMetaData), new Configuration()))) {
        PageReadStore pagesMetaData;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pagesMetaData = r.readNextRowGroup())) {
            final long rows = pagesMetaData.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                int treeId = g.getInteger(0, 0);
                double treeWeight = g.getDouble(2, 0);
                treeWeightsByTreeID.put(treeId, treeWeight);
            }
        }
    } catch (IOException e) {
        String msg = "Error reading parquet file with MetaData by the path: " + pathToMdlMetaData;
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    treeWeights = new double[treeWeightsByTreeID.size()];
    for (int i = 0; i < treeWeights.length; i++)
        treeWeights[i] = treeWeightsByTreeID.get(i);

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup) recordReader.read();
                final int treeID = g.getInteger(0, 0);
                final SimpleGroup nodeDataGroup = (SimpleGroup) g.getGroup(1, 0);

                NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);

                if (nodesByTreeId.containsKey(treeID)) {
                    Map<Integer, NodeData> nodesByNodeId = nodesByTreeId.get(treeID);
                    nodesByNodeId.put(nodeData.id, nodeData);
                } else {
                    TreeMap<Integer, NodeData> nodesByNodeId = new TreeMap<>();
                    nodesByNodeId.put(nodeData.id, nodeData);
                    nodesByTreeId.put(treeID, nodesByNodeId);
                }
            }
        }

        final List<IgniteModel<Vector, Double>> models = new ArrayList<>();
        nodesByTreeId.forEach((key, nodes) -> models.add(buildDecisionTreeModel(nodes)));
        return new GDBModel(models, new WeightedPredictionsAggregator(treeWeights), lbMapper);
    } catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
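Note that the loop which flattens treeWeightsByTreeID into the treeWeights array assumes the tree IDs form a contiguous 0..n-1 range; if an ID were missing, unboxing the null returned by get(i) would throw a NullPointerException. A hedged variant of that small step with an explicit check, referring to the same treeWeightsByTreeID map as the method above, might look like this:

double[] treeWeights = new double[treeWeightsByTreeID.size()];
for (int i = 0; i < treeWeights.length; i++) {
    Double weight = treeWeightsByTreeID.get(i);
    if (weight == null)
        throw new IllegalStateException("No weight found for tree with id " + i);
    treeWeights[i] = weight;
}

For well-formed metadata files the behavior is identical; the check only makes the failure mode explicit for malformed ones.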
Use of org.apache.parquet.column.page.PageReadStore in project flink by apache.
The class ParquetColumnarRowSplitReader, method readNextRowGroup.
private void readNextRowGroup() throws IOException {
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
        throw new IOException("expecting more rows but reached last block. Read "
            + rowsReturned + " out of " + totalRowCount);
    }
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    columnReaders = new AbstractColumnReader[columns.size()];
    for (int i = 0; i < columns.size(); ++i) {
        columnReaders[i] = createColumnReader(
            utcTimestamp, selectedTypes[i], columns.get(i), pages.getPageReader(columns.get(i)));
    }
    totalCountLoadedSoFar += pages.getRowCount();
}
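In this Flink reader the PageReadStore is consumed column by column via getPageReader(ColumnDescriptor) rather than through a record-level reader. A rough standalone sketch of that column-oriented access pattern, assuming a placeholder file path, might look like the following:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class ColumnPageSketch {
    public static void main(String[] args) throws IOException {
        // Placeholder path; replace with a real Parquet file.
        Path path = new Path("/tmp/example.parquet");

        try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
            MessageType schema = reader.getFooter().getFileMetaData().getSchema();

            PageReadStore rowGroup;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                for (ColumnDescriptor column : schema.getColumns()) {
                    // One PageReader per column chunk in the current row group.
                    PageReader pageReader = rowGroup.getPageReader(column);

                    long valueCount = 0;
                    DataPage page;
                    while ((page = pageReader.readPage()) != null)
                        valueCount += page.getValueCount();

                    System.out.println(column + ": " + valueCount + " values");
                }
            }
        }
    }
}

The Flink method hands each such per-column PageReader to createColumnReader instead of draining the pages directly.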
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class TestDataPageV1Checksums, method testCompression.
/**
 * Tests that the checksum is calculated using the compressed version of the data and that
 * checksum verification succeeds
 */
@Test
public void testCompression() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

    Path path = writeSimpleParquetFile(conf, CompressionCodecName.SNAPPY);

    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage1, snappy(colAPage1Bytes));
        assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

        DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage2, snappy(colAPage2Bytes));
        assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

        DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage1, snappy(colBPage1Bytes));
        assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage2, snappy(colBPage2Bytes));
        assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
    }
}
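The key point of this test is that the stored CRC covers the page bytes as they appear on disk, i.e. after Snappy compression, not the uncompressed payload. As a rough illustration, a page-level CRC32 over a compressed buffer can be computed with the JDK alone (the class and helper name here are made up for the example):

import java.util.zip.CRC32;

public class PageCrcSketch {
    /** Illustrative helper: CRC32 over the bytes that would actually be written for the page. */
    static int crcOfCompressedPage(byte[] compressedPageBytes) {
        CRC32 crc = new CRC32();
        crc.update(compressedPageBytes, 0, compressedPageBytes.length);
        // The 32-bit checksum is stored as a signed int in the page header.
        return (int) crc.getValue();
    }
}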
Use of org.apache.parquet.column.page.PageReadStore in project parquet-mr by apache.
The class TestDataPageV1Checksums, method testWriteOnVerifyOn.
/**
 * Write out checksums and verify them on the read path. Tests that crc is set and that we can
 * read back what we wrote if checksums are enabled on both the write and read path.
 */
@Test
public void testWriteOnVerifyOn() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

    try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
        assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

        DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
        assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
        assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

        DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
        assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
        assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
    }
}
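Outside of this test harness, the same verification can be switched on when reading an existing file by setting the read-side flag on the Configuration handed to the reader; a corrupted page is then expected to fail while the row group's pages are read rather than silently producing bad data. A minimal sketch, assuming a placeholder file path and assuming the Hadoop-based read options pick the flag up from the Configuration:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ChecksumVerifyingReadSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Ask the read path to verify page checksums, as the tests above do.
        conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

        // Placeholder path; replace with a real Parquet file.
        Path path = new Path("/tmp/example.parquet");

        try (ParquetFileReader reader = new ParquetFileReader(
                HadoopInputFile.fromPath(path, conf), HadoopReadOptions.builder(conf).build())) {
            PageReadStore pages;
            while ((pages = reader.readNextRowGroup()) != null)
                System.out.println("Row group with " + pages.getRowCount() + " rows read with verification enabled");
        }
    }
}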