use of org.apache.flink.orc.OrcColumnarRowInputFormat in project flink by apache.
the class OrcNoHiveColumnarRowInputFormat method createPartitionedFormat.
/**
* Create a partitioned {@link OrcColumnarRowInputFormat}, the partition columns can be
* generated by split.
*/
public static <SplitT extends FileSourceSplit> OrcColumnarRowInputFormat<VectorizedRowBatch, SplitT> createPartitionedFormat(Configuration hadoopConfig, RowType tableType, List<String> partitionKeys, PartitionFieldExtractor<SplitT> extractor, int[] selectedFields, List<OrcFilters.Predicate> conjunctPredicates, int batchSize, Function<RowType, TypeInformation<RowData>> rowTypeInfoFactory) {
// TODO FLINK-25113 all this partition keys code should be pruned from the orc format,
// because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
// keys.
String[] tableFieldNames = tableType.getFieldNames().toArray(new String[0]);
LogicalType[] tableFieldTypes = tableType.getChildren().toArray(new LogicalType[0]);
List<String> orcFieldNames = getNonPartNames(tableFieldNames, partitionKeys);
int[] orcSelectedFields = getSelectedOrcFields(tableFieldNames, selectedFields, orcFieldNames);
ColumnBatchFactory<VectorizedRowBatch, SplitT> batchGenerator = (SplitT split, VectorizedRowBatch rowBatch) -> {
// create and initialize the row batch
ColumnVector[] vectors = new ColumnVector[selectedFields.length];
for (int i = 0; i < vectors.length; i++) {
String name = tableFieldNames[selectedFields[i]];
LogicalType type = tableFieldTypes[selectedFields[i]];
vectors[i] = partitionKeys.contains(name) ? createFlinkVectorFromConstant(type, extractor.extract(split, name, type), batchSize) : createFlinkVector(rowBatch.cols[orcFieldNames.indexOf(name)]);
}
return new VectorizedColumnBatch(vectors);
};
return new OrcColumnarRowInputFormat<>(new OrcNoHiveShim(), hadoopConfig, convertToOrcTypeWithPart(tableFieldNames, tableFieldTypes, partitionKeys), orcSelectedFields, conjunctPredicates, batchSize, batchGenerator, rowTypeInfoFactory.apply(new RowType(Arrays.stream(selectedFields).mapToObj(i -> tableType.getFields().get(i)).collect(Collectors.toList()))));
}
Aggregations