Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.
The class ParquetSplitReaderUtil, method genPartColumnarRowReader.
/**
* Util for generating partitioned {@link ParquetColumnarRowSplitReader}.
*/
public static ParquetColumnarRowSplitReader genPartColumnarRowReader(
        boolean utcTimestamp,
        boolean caseSensitive,
        Configuration conf,
        String[] fullFieldNames,
        DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec,
        int[] selectedFields,
        int batchSize,
        Path path,
        long splitStart,
        long splitLength) throws IOException {
    List<String> nonPartNames = Arrays.stream(fullFieldNames)
            .filter(n -> !partitionSpec.containsKey(n))
            .collect(Collectors.toList());
    List<String> selNonPartNames = Arrays.stream(selectedFields)
            .mapToObj(i -> fullFieldNames[i])
            .filter(nonPartNames::contains)
            .collect(Collectors.toList());
    int[] selParquetFields = selNonPartNames.stream()
            .mapToInt(nonPartNames::indexOf)
            .toArray();
    ParquetColumnarRowSplitReader.ColumnBatchGenerator gen = readVectors -> {
        // Create and initialize the row batch: partition columns are filled with
        // constant vectors built from the partition spec; all other columns reuse
        // the vectors decoded from the Parquet file.
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = fullFieldNames[selectedFields[i]];
            LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
            vectors[i] = partitionSpec.containsKey(name)
                    ? createVectorFromConstant(type, partitionSpec.get(name), batchSize)
                    : readVectors[selNonPartNames.indexOf(name)];
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new ParquetColumnarRowSplitReader(
            utcTimestamp,
            caseSensitive,
            conf,
            Arrays.stream(selParquetFields)
                    .mapToObj(i -> fullFieldTypes[i].getLogicalType())
                    .toArray(LogicalType[]::new),
            selNonPartNames.toArray(new String[0]),
            gen,
            batchSize,
            new org.apache.hadoop.fs.Path(path.toUri()),
            splitStart,
            splitLength);
}
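The generator above splices constant partition vectors into each batch alongside the vectors decoded from the file. Below is a minimal sketch of driving the resulting reader, assuming the reachedEnd()/nextRecord() iteration style used by Flink's Parquet reader tests; the schema, partition value, and file path are hypothetical.

import java.io.IOException;
import java.util.Collections;
import java.util.Map;

import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.vector.ParquetColumnarRowSplitReader;
import org.apache.flink.formats.parquet.vector.ParquetSplitReaderUtil;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.DataType;
import org.apache.hadoop.conf.Configuration;

public class ParquetPartReaderExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical schema: one data column plus one partition column "dt".
        String[] fieldNames = {"id", "dt"};
        DataType[] fieldTypes = {DataTypes.BIGINT(), DataTypes.STRING()};
        // "dt" never appears in the Parquet file; its value comes from the spec.
        Map<String, Object> partitionSpec = Collections.singletonMap("dt", "2024-01-01");

        ParquetColumnarRowSplitReader reader =
                ParquetSplitReaderUtil.genPartColumnarRowReader(
                        true,                               // utcTimestamp
                        true,                               // caseSensitive
                        new Configuration(),
                        fieldNames,
                        fieldTypes,
                        partitionSpec,
                        new int[] {0, 1},                   // project both columns
                        2048,                               // batchSize
                        new Path("/tmp/part/data.parquet"), // hypothetical path
                        0L,
                        Long.MAX_VALUE);                    // whole file as one split
        try {
            while (!reader.reachedEnd()) {
                RowData row = reader.nextRecord();
                // row.getLong(0) comes from the file; row.getString(1) is the
                // constant partition value spliced in by the batch generator.
            }
        } finally {
            reader.close();
        }
    }
}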
Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.
The class OrcNoHiveSplitReaderUtil, method genPartColumnarRowReader.
/**
* Util for generating partitioned {@link OrcColumnarRowSplitReader}.
*/
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        Configuration conf,
        String[] fullFieldNames,
        DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Path path,
        long splitStart,
        long splitLength) throws IOException {
    List<String> nonPartNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] selectedOrcFields = getSelectedOrcFields(fullFieldNames, selectedFields, nonPartNames);
    OrcColumnarRowSplitReader.ColumnBatchGenerator<VectorizedRowBatch> gen =
            (VectorizedRowBatch rowBatch) -> {
                // Create and initialize the row batch: partition columns become
                // constant vectors, all other columns wrap the corresponding
                // column of the ORC row batch.
                ColumnVector[] vectors = new ColumnVector[selectedFields.length];
                for (int i = 0; i < vectors.length; i++) {
                    String name = fullFieldNames[selectedFields[i]];
                    LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
                    vectors[i] = partitionSpec.containsKey(name)
                            ? createFlinkVectorFromConstant(type, partitionSpec.get(name), batchSize)
                            : createFlinkVector(rowBatch.cols[nonPartNames.indexOf(name)]);
                }
                return new VectorizedColumnBatch(vectors);
            };
    return new OrcColumnarRowSplitReader<>(
            new OrcNoHiveShim(),
            conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            selectedOrcFields,
            gen,
            conjunctPredicates,
            batchSize,
            path,
            splitStart,
            splitLength);
}
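createFlinkVectorFromConstant fills an entire heap vector with the partition value, so downstream operators can treat partition columns exactly like columns read from the file. Below is a standalone sketch of that substitution, assuming the heap vector classes under the columnar vector package named above; the fill loops stand in for Flink's actual constant-vector factory, and the class and method names are hypothetical.

import org.apache.flink.table.data.columnar.vector.ColumnVector;
import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch;
import org.apache.flink.table.data.columnar.vector.heap.HeapIntVector;
import org.apache.flink.table.data.columnar.vector.heap.HeapLongVector;

public final class PartitionConstantBatchSketch {
    /**
     * Builds a two-column batch: column 0 carries per-row file data, column 1
     * repeats a single partition value for every row.
     */
    public static VectorizedColumnBatch buildBatch(long[] fileValues, int partitionValue) {
        int batchSize = fileValues.length;

        // Stand-in for a vector decoded from an ORC/Parquet file.
        HeapLongVector dataCol = new HeapLongVector(batchSize);
        for (int i = 0; i < batchSize; i++) {
            dataCol.setLong(i, fileValues[i]);
        }

        // Partition column: every slot holds the same constant, mirroring what
        // the constant-vector factory produces from the partition spec value.
        HeapIntVector partCol = new HeapIntVector(batchSize);
        for (int i = 0; i < batchSize; i++) {
            partCol.setInt(i, partitionValue);
        }

        VectorizedColumnBatch batch =
                new VectorizedColumnBatch(new ColumnVector[] {dataCol, partCol});
        batch.setNumRows(batchSize);
        return batch;
    }
}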
Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.
The class OrcSplitReaderUtil, method genPartColumnarRowReader.
/**
* Util for generating partitioned {@link OrcColumnarRowSplitReader}.
*/
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        String hiveVersion,
        Configuration conf,
        String[] fullFieldNames,
        DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Path path,
        long splitStart,
        long splitLength) throws IOException {
    List<String> nonPartNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] selectedOrcFields = getSelectedOrcFields(fullFieldNames, selectedFields, nonPartNames);
    ColumnBatchGenerator<VectorizedRowBatch> gen =
            (VectorizedRowBatch rowBatch) -> {
                // Create and initialize the row batch: partition columns become
                // constant vectors; the rest wrap the ORC row batch's columns,
                // converted according to the Flink logical type.
                ColumnVector[] vectors = new ColumnVector[selectedFields.length];
                for (int i = 0; i < vectors.length; i++) {
                    String name = fullFieldNames[selectedFields[i]];
                    LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
                    vectors[i] = partitionSpec.containsKey(name)
                            ? createFlinkVectorFromConstant(type, partitionSpec.get(name), batchSize)
                            : createFlinkVector(rowBatch.cols[nonPartNames.indexOf(name)], type);
                }
                return new VectorizedColumnBatch(vectors);
            };
    return new OrcColumnarRowSplitReader<>(
            OrcShim.createShim(hiveVersion),
            conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            selectedOrcFields,
            gen,
            conjunctPredicates,
            batchSize,
            path,
            splitStart,
            splitLength);
}
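A minimal sketch of using this Hive-version-aware variant, assuming the reachedEnd()/nextRecord(reuse) loop that OrcSplitReader exposes (Flink's tests pass null as the reuse object); the Hive version string, schema, partition value, and path are hypothetical.

import java.io.IOException;
import java.util.Collections;

import org.apache.flink.core.fs.Path;
import org.apache.flink.orc.OrcColumnarRowSplitReader;
import org.apache.flink.orc.OrcSplitReaderUtil;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.types.DataType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class OrcPartReaderExample {
    public static void main(String[] args) throws IOException {
        String[] fieldNames = {"id", "region"};
        DataType[] fieldTypes = {DataTypes.BIGINT(), DataTypes.STRING()};

        OrcColumnarRowSplitReader<VectorizedRowBatch> reader =
                OrcSplitReaderUtil.genPartColumnarRowReader(
                        "2.3.4",                                  // hiveVersion (hypothetical)
                        new Configuration(),
                        fieldNames,
                        fieldTypes,
                        Collections.singletonMap("region", "eu"), // partition spec
                        new int[] {0, 1},
                        Collections.emptyList(),                  // no predicates pushed down
                        2048,
                        new Path("/tmp/part/data.orc"),           // hypothetical path
                        0L,
                        Long.MAX_VALUE);
        try {
            while (!reader.reachedEnd()) {
                RowData row = reader.nextRecord(null);
                // Column 1 ("region") is the constant partition value "eu".
            }
        } finally {
            reader.close();
        }
    }
}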
Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.
The class ParquetVectorizedInputFormat, method createReaderBatch.
private ParquetReaderBatch<T> createReaderBatch(
        SplitT split,
        MessageType requestedSchema,
        Pool.Recycler<ParquetReaderBatch<T>> recycler) {
    WritableColumnVector[] writableVectors = createWritableVectors(requestedSchema);
    VectorizedColumnBatch columnarBatch =
            batchFactory.create(split, createReadableVectors(writableVectors));
    return createReaderBatch(writableVectors, columnarBatch, recycler);
}
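Once a batch has been produced, consumers read columns positionally through the batch's typed getters (ColumnarRowData does the same per row under the hood). A minimal sketch, assuming a nullable BIGINT column at the given position; the class and method names are hypothetical.

import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch;

public final class BatchScanSketch {
    /** Sums a LONG column at position colId, skipping null slots. */
    public static long sumLongColumn(VectorizedColumnBatch batch, int colId) {
        long sum = 0L;
        for (int row = 0; row < batch.getNumRows(); row++) {
            if (!batch.isNullAt(row, colId)) {
                sum += batch.getLong(row, colId);
            }
        }
        return sum;
    }
}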