Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.
The class OrcNoHiveColumnarRowInputFormat, method createPartitionedFormat.
/**
 * Create a partitioned {@link OrcColumnarRowInputFormat}, where the partition columns can be
 * generated from the split.
*/
public static <SplitT extends FileSourceSplit>
        OrcColumnarRowInputFormat<VectorizedRowBatch, SplitT> createPartitionedFormat(
                Configuration hadoopConfig,
                RowType tableType,
                List<String> partitionKeys,
                PartitionFieldExtractor<SplitT> extractor,
                int[] selectedFields,
                List<OrcFilters.Predicate> conjunctPredicates,
                int batchSize,
                Function<RowType, TypeInformation<RowData>> rowTypeInfoFactory) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the orc format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    String[] tableFieldNames = tableType.getFieldNames().toArray(new String[0]);
    LogicalType[] tableFieldTypes = tableType.getChildren().toArray(new LogicalType[0]);
    List<String> orcFieldNames = getNonPartNames(tableFieldNames, partitionKeys);
    int[] orcSelectedFields =
            getSelectedOrcFields(tableFieldNames, selectedFields, orcFieldNames);
    ColumnBatchFactory<VectorizedRowBatch, SplitT> batchGenerator =
            (SplitT split, VectorizedRowBatch rowBatch) -> {
                // create and initialize the row batch
                ColumnVector[] vectors = new ColumnVector[selectedFields.length];
                for (int i = 0; i < vectors.length; i++) {
                    String name = tableFieldNames[selectedFields[i]];
                    LogicalType type = tableFieldTypes[selectedFields[i]];
                    vectors[i] =
                            partitionKeys.contains(name)
                                    ? createFlinkVectorFromConstant(
                                            type, extractor.extract(split, name, type), batchSize)
                                    : createFlinkVector(
                                            rowBatch.cols[orcFieldNames.indexOf(name)]);
                }
                return new VectorizedColumnBatch(vectors);
            };
    return new OrcColumnarRowInputFormat<>(
            new OrcNoHiveShim(),
            hadoopConfig,
            convertToOrcTypeWithPart(tableFieldNames, tableFieldTypes, partitionKeys),
            orcSelectedFields,
            conjunctPredicates,
            batchSize,
            batchGenerator,
            rowTypeInfoFactory.apply(
                    new RowType(
                            Arrays.stream(selectedFields)
                                    .mapToObj(i -> tableType.getFields().get(i))
                                    .collect(Collectors.toList()))));
}
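Below is a minimal construction sketch for the format above. The table layout, the partition value, the extractor lambda, the batch size, and the use of InternalTypeInfo::of as the TypeInformation factory are illustrative assumptions, not taken from the Flink sources.

// Hypothetical table (id INT, name STRING, dt STRING), partitioned by dt.
Configuration hadoopConf = new Configuration();
RowType tableType =
        RowType.of(
                new LogicalType[] {new IntType(), new VarCharType(10), new VarCharType(10)},
                new String[] {"id", "name", "dt"});
OrcColumnarRowInputFormat<VectorizedRowBatch, FileSourceSplit> format =
        OrcNoHiveColumnarRowInputFormat.createPartitionedFormat(
                hadoopConf,
                tableType,
                Collections.singletonList("dt"),
                // assumed extractor: derive the partition value from the split (e.g. its path)
                (split, fieldName, fieldType) -> "2024-01-01",
                new int[] {0, 2}, // project id and dt
                Collections.emptyList(), // no predicates pushed down
                2048,
                InternalTypeInfo::of);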
Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.
The class OrcNoHiveBulkWriterFactory, method create.
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
    OrcFile.WriterOptions opts = OrcFile.writerOptions(new Properties(), conf);
    TypeDescription description = TypeDescription.fromString(schema);
    opts.setSchema(description);
    opts.physicalWriter(new NoHivePhysicalWriterImpl(out, opts));
    WriterImpl writer = new WriterImpl(null, new Path("."), opts);
    VectorizedRowBatch rowBatch = description.createRowBatch();
    return new BulkWriter<RowData>() {

        @Override
        public void addElement(RowData row) throws IOException {
            int rowId = rowBatch.size++;
            for (int i = 0; i < row.getArity(); ++i) {
                setColumn(rowId, rowBatch.cols[i], fieldTypes[i], row, i);
            }
            if (rowBatch.size == rowBatch.getMaxSize()) {
                writer.addRowBatch(rowBatch);
                rowBatch.reset();
            }
        }

        @Override
        public void flush() throws IOException {
            if (rowBatch.size != 0) {
                writer.addRowBatch(rowBatch);
                rowBatch.reset();
            }
        }

        @Override
        public void finish() throws IOException {
            flush();
            writer.close();
        }
    };
}
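A small usage sketch for the writer returned by create(). The factory instance, the output stream, and the two-column row layout are assumptions for illustration; they are not shown in the snippet above.

// Assumes the factory was built elsewhere with a matching schema, e.g. struct<id:int,name:string>.
BulkWriter<RowData> bulkWriter = factory.create(outputStream);
bulkWriter.addElement(GenericRowData.of(1, StringData.fromString("alice")));
bulkWriter.addElement(GenericRowData.of(2, StringData.fromString("bob")));
// finish() flushes the partially filled batch and closes the underlying ORC writer
bulkWriter.finish();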
Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.
The class OrcColumnarRowSplitReaderNoHiveTest, method prepareReadFileWithTypes.
@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
    // NOTE: ORC retains field name information, so the names here must match the ORC schema.
    TypeDescription schema =
            TypeDescription.fromString(
                    "struct<"
                            + "f0:float,"
                            + "f1:double,"
                            + "f2:timestamp,"
                            + "f3:tinyint,"
                            + "f4:smallint"
                            + ">");
    org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
    Configuration conf = new Configuration();
    Writer writer =
            OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch(rowSize);
    DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
    DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
    TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
    LongColumnVector col3 = (LongColumnVector) batch.cols[3];
    LongColumnVector col4 = (LongColumnVector) batch.cols[4];
    col0.noNulls = false;
    col1.noNulls = false;
    col2.noNulls = false;
    col3.noNulls = false;
    col4.noNulls = false;
    // fill the first rowSize - 1 rows with data; the last row stays null in every column
    for (int i = 0; i < rowSize - 1; i++) {
        col0.vector[i] = i;
        col1.vector[i] = i;
        Timestamp timestamp = toTimestamp(i);
        col2.time[i] = timestamp.getTime();
        col2.nanos[i] = timestamp.getNanos();
        col3.vector[i] = i;
        col4.vector[i] = i;
    }
    col0.isNull[rowSize - 1] = true;
    col1.isNull[rowSize - 1] = true;
    col2.isNull[rowSize - 1] = true;
    col3.isNull[rowSize - 1] = true;
    col4.isNull[rowSize - 1] = true;
    batch.size = rowSize;
    writer.addRowBatch(batch);
    batch.reset();
    writer.close();
}
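For context, a file written this way can be read back with the core ORC reader API. The verification loop below is a sketch that reuses filePath and conf from the method above and only inspects f0; it is not part of the original test.

org.apache.orc.Reader reader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
RecordReader rows = reader.rows();
VectorizedRowBatch readBatch = reader.getSchema().createRowBatch();
while (rows.nextBatch(readBatch)) {
    DoubleColumnVector f0 = (DoubleColumnVector) readBatch.cols[0];
    for (int r = 0; r < readBatch.size; r++) {
        // the last row written above is null in every column
        System.out.println(f0.isNull[r] ? "null" : String.valueOf(f0.vector[r]));
    }
}
rows.close();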
Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project incubator-gobblin by apache.
The class GenericRecordToOrcValueWriterTest, method deserializeOrcRecords.
public static final List<Writable> deserializeOrcRecords(Path orcFilePath, FileSystem fs)
        throws IOException {
    org.apache.orc.Reader fileReader =
            OrcFile.createReader(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));
    RecordReader recordReader = fileReader.rows();
    TypeDescription schema = fileReader.getSchema();
    VectorizedRowBatch batch = schema.createRowBatch();
    recordReader.nextBatch(batch);
    int rowInBatch = 0;
    // result container
    List<Writable> orcRecords = new ArrayList<>();
    long rowCount = fileReader.getNumberOfRows();
    while (rowCount > 0) {
        // fetch the next batch once the current one is exhausted
        if (rowInBatch >= batch.size) {
            recordReader.nextBatch(batch);
            rowInBatch = 0;
        }
        // Deserialize records using the MapReduce-like API
        if (schema.getCategory() == TypeDescription.Category.STRUCT) {
            OrcStruct result = (OrcStruct) OrcStruct.createValue(fileReader.getSchema());
            List<TypeDescription> children = schema.getChildren();
            int numberOfChildren = children.size();
            for (int i = 0; i < numberOfChildren; ++i) {
                result.setFieldValue(
                        i,
                        nextValue(batch.cols[i], rowInBatch, children.get(i), result.getFieldValue(i)));
            }
            orcRecords.add(result);
        } else {
            throw new UnsupportedOperationException(
                    "The serialized records have to be a struct in the outer-most layer.");
        }
        rowCount -= 1;
        rowInBatch += 1;
    }
    recordReader.close();
    return orcRecords;
}
Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.
The class OrcNoHiveSplitReaderUtil, method genPartColumnarRowReader.
/**
 * Util for generating a partitioned {@link OrcColumnarRowSplitReader}.
*/
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        Configuration conf,
        String[] fullFieldNames,
        DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Path path,
        long splitStart,
        long splitLength) throws IOException {
    List<String> nonPartNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] selectedOrcFields = getSelectedOrcFields(fullFieldNames, selectedFields, nonPartNames);
    OrcColumnarRowSplitReader.ColumnBatchGenerator<VectorizedRowBatch> gen =
            (VectorizedRowBatch rowBatch) -> {
                // create and initialize the row batch
                ColumnVector[] vectors = new ColumnVector[selectedFields.length];
                for (int i = 0; i < vectors.length; i++) {
                    String name = fullFieldNames[selectedFields[i]];
                    LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
                    vectors[i] =
                            partitionSpec.containsKey(name)
                                    ? createFlinkVectorFromConstant(
                                            type, partitionSpec.get(name), batchSize)
                                    : createFlinkVector(rowBatch.cols[nonPartNames.indexOf(name)]);
                }
                return new VectorizedColumnBatch(vectors);
            };
    return new OrcColumnarRowSplitReader<>(
            new OrcNoHiveShim(),
            conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            selectedOrcFields,
            gen,
            conjunctPredicates,
            batchSize,
            path,
            splitStart,
            splitLength);
}
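A hedged read-loop sketch for the reader produced above. The table layout, partition value, path, and split bounds are illustrative, and the reachedEnd()/nextRecord() loop follows the general pattern of Flink's OrcSplitReader rather than code shown here.

// Assumed table (id INT, dt STRING), with dt as the partition column.
OrcColumnarRowSplitReader<VectorizedRowBatch> reader =
        OrcNoHiveSplitReaderUtil.genPartColumnarRowReader(
                new Configuration(),
                new String[] {"id", "dt"},
                new DataType[] {DataTypes.INT(), DataTypes.STRING()},
                Collections.singletonMap("dt", "2024-01-01"),
                new int[] {0, 1},
                Collections.emptyList(),
                2048,
                new Path("/tmp/part-0.orc"),
                0L,
                fileLength); // split covering the whole file; fileLength is assumed known
while (!reader.reachedEnd()) {
    RowData row = reader.nextRecord(null);
    // ... consume row ...
}
reader.close();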