Use of org.apache.parquet.io.RecordReader in project drill by apache.
The class DrillParquetReader, method setup.
@Override
public void setup(OperatorContext context, OutputMutator output) throws ExecutionSetupException {
  try {
    this.operatorContext = context;
    schema = footer.getFileMetaData().getSchema();
    MessageType projection;
    final List<SchemaPath> columnsNotFound = new ArrayList<>(getColumns().size());
    if (isStarQuery()) {
      projection = schema;
    } else {
      projection = getProjection(schema, getColumns(), columnsNotFound);
      if (projection == null) {
        projection = schema;
      }
      if (!columnsNotFound.isEmpty()) {
        nullFilledVectors = new ArrayList<>(columnsNotFound.size());
        for (SchemaPath col : columnsNotFound) {
          // col.toExpr() is used as the field name here since we don't want to see these fields in the existing maps
          nullFilledVectors.add(output.addField(
              MaterializedField.create(col.toExpr(), OPTIONAL_INT), NullableIntVector.class));
        }
        noColumnsFound = columnsNotFound.size() == getColumns().size();
      }
    }
    logger.debug("Requesting schema {}", projection);
    if (!noColumnsFound) {
      // Discard the columns not found in the schema when creating the DrillParquetRecordMaterializer,
      // since they have already been added to the output.
      @SuppressWarnings("unchecked")
      Collection<SchemaPath> columns = columnsNotFound.isEmpty()
          ? getColumns()
          : CollectionUtils.subtract(getColumns(), columnsNotFound);
      recordMaterializer = new DrillParquetRecordMaterializer(
          output, projection, columns, fragmentContext.getOptions(), containsCorruptedDates);
    }
    if (numRecordsToRead == 0 || noColumnsFound) {
      // no need to init readers
      return;
    }
    ColumnIOFactory factory = new ColumnIOFactory(false);
    MessageColumnIO columnIO = factory.getColumnIO(projection, schema);
    BlockMetaData blockMetaData = footer.getBlocks().get(entry.getRowGroupIndex());
    Map<ColumnPath, ColumnChunkMetaData> paths = blockMetaData.getColumns().stream()
        .collect(Collectors.toMap(ColumnChunkMetaData::getPath, Function.identity(), (o, n) -> n));
    BufferAllocator allocator = operatorContext.getAllocator();
    CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(
        drillFileSystem.getConf(), new ParquetDirectByteBufferAllocator(allocator), 0);
    pageReadStore = new ColumnChunkIncReadStore(
        numRecordsToRead, ccf, allocator, drillFileSystem, entry.getPath());
    for (String[] path : schema.getPaths()) {
      Type type = schema.getType(path);
      if (type.isPrimitive()) {
        ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
        pageReadStore.addColumn(schema.getColumnDescription(path), md);
      }
    }
    recordReader = columnIO.getRecordReader(pageReadStore, recordMaterializer);
  } catch (Exception e) {
    throw handleAndRaise("Failure in setting up reader", e);
  }
}
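For orientation, the Ignite examples below all reduce to the same core parquet-mr read loop: open the file, build a MessageColumnIO from the file schema, obtain a RecordReader per row group via a GroupRecordConverter, and read each row back as a Group. A minimal standalone sketch of that pattern, assuming a placeholder input file model.parquet and using only plain parquet-mr and Hadoop classes (no Drill or Ignite code), could look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class ParquetReadSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder input file; substitute a real parquet file path.
    Path path = new Path("model.parquet");
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(path, new Configuration()))) {
      MessageType schema = reader.getFooter().getFileMetaData().getSchema();
      MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
      PageReadStore pages;
      while ((pages = reader.readNextRowGroup()) != null) {
        // A fresh RecordReader is needed for each row group;
        // GroupRecordConverter materializes each row as a Group.
        RecordReader<Group> recordReader =
            columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
        for (long i = 0, rows = pages.getRowCount(); i < rows; i++) {
          Group g = recordReader.read();
          System.out.println(g);
        }
      }
    }
  }
}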
Use of org.apache.parquet.io.RecordReader in project ignite by apache.
The class SparkModelParser, method loadKMeansModel.
/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadKMeansModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  Vector[] centers = null;
  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    while (null != (pages = r.readNextRowGroup())) {
      final int rows = (int) pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      centers = new DenseVector[rows];
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup) recordReader.read();
        // final int clusterIdx = g.getInteger(0, 0);
        Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);
        final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);
        centers[i] = new DenseVector(amountOfCoefficients);
        for (int j = 0; j < amountOfCoefficients; j++) {
          double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
          centers[i].set(j, coefficient);
        }
      }
    }
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }
  return new KMeansModel(centers, new EuclideanDistance());
}
Use of org.apache.parquet.io.RecordReader in project ignite by apache.
The class SparkModelParser, method loadLinearSVMModel.
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinearSVMModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  Vector coefficients = null;
  double interceptor = 0;
  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup) recordReader.read();
        interceptor = readSVMInterceptor(g);
        coefficients = readSVMCoefficients(g);
      }
    }
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }
  return new SVMLinearClassificationModel(coefficients, interceptor);
}
Use of org.apache.parquet.io.RecordReader in project ignite by apache.
The class SparkModelParser, method loadDecisionTreeModel.
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    final Map<Integer, NodeData> nodes = new TreeMap<>();
    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup) recordReader.read();
        NodeData nodeData = extractNodeDataFromParquetRow(g);
        nodes.put(nodeData.id, nodeData);
      }
    }
    return buildDecisionTreeModel(nodes);
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }
  return null;
}
Use of org.apache.parquet.io.RecordReader in project ignite by apache.
The class SparkModelParser, method loadLogRegModel.
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl, LearningEnvironment learningEnvironment) {
  Vector coefficients = null;
  double interceptor = 0;
  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup) recordReader.read();
        interceptor = readInterceptor(g);
        coefficients = readCoefficients(g);
      }
    }
  } catch (IOException e) {
    String msg = "Error reading parquet file: " + e.getMessage();
    learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
    e.printStackTrace();
  }
  return new LogisticRegressionModel(coefficients, interceptor);
}