Use of org.apache.flink.table.data.columnar.vector.ColumnVector in project flink by apache.
Class ParquetColumnarRowInputFormat, method createPartitionedFormat.
/**
 * Creates a partitioned {@link ParquetColumnarRowInputFormat}; the partition columns can be
 * generated from the {@link Path}.
 */
public static <SplitT extends FileSourceSplit> ParquetColumnarRowInputFormat<SplitT> createPartitionedFormat(
        Configuration hadoopConfig, RowType producedRowType, TypeInformation<RowData> producedTypeInfo,
        List<String> partitionKeys, PartitionFieldExtractor<SplitT> extractor,
        int batchSize, boolean isUtcTimestamp, boolean isCaseSensitive) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the parquet format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    RowType projectedRowType = new RowType(
            producedRowType.getFields().stream()
                    .filter(field -> !partitionKeys.contains(field.getName()))
                    .collect(Collectors.toList()));
    List<String> projectedNames = projectedRowType.getFieldNames();
    ColumnBatchFactory<SplitT> factory = (SplitT split, ColumnVector[] parquetVectors) -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[producedRowType.getFieldCount()];
        for (int i = 0; i < vectors.length; i++) {
            RowType.RowField field = producedRowType.getFields().get(i);
            vectors[i] = partitionKeys.contains(field.getName())
                    ? createVectorFromConstant(
                            field.getType(),
                            extractor.extract(split, field.getName(), field.getType()),
                            batchSize)
                    : parquetVectors[projectedNames.indexOf(field.getName())];
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new ParquetColumnarRowInputFormat<>(
            hadoopConfig, projectedRowType, producedTypeInfo, factory,
            batchSize, isUtcTimestamp, isCaseSensitive);
}
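As a point of reference, here is a minimal sketch of how this factory method might be invoked. The schema (id, name, dt), the batch size, the lambda-based PartitionFieldExtractor returning a hard-coded value, and the use of InternalTypeInfo.of for the produced type information are illustrative assumptions, not taken from the Flink sources above.

    // Hypothetical produced schema: two physical columns plus one partition column "dt".
    RowType producedRowType = RowType.of(
            new LogicalType[] {
                new IntType(), new VarCharType(VarCharType.MAX_LENGTH), new VarCharType(VarCharType.MAX_LENGTH)
            },
            new String[] {"id", "name", "dt"});
    List<String> partitionKeys = Collections.singletonList("dt");
    // Illustrative extractor: in practice the value would be parsed from the split path,
    // e.g. a ".../dt=2024-01-01/..." directory; here it is simply hard-coded.
    PartitionFieldExtractor<FileSourceSplit> extractor =
            (split, fieldName, fieldType) -> "2024-01-01";
    ParquetColumnarRowInputFormat<FileSourceSplit> format =
            ParquetColumnarRowInputFormat.createPartitionedFormat(
                    new org.apache.hadoop.conf.Configuration(),
                    producedRowType,
                    InternalTypeInfo.of(producedRowType),
                    partitionKeys,
                    extractor,
                    500,   // batchSize
                    true,  // isUtcTimestamp
                    true); // isCaseSensitive

The resulting format reads only id and name from Parquet; dt is synthesized per batch via createVectorFromConstant from the value the extractor pulls out of the split.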
Use of org.apache.flink.table.data.columnar.vector.ColumnVector in project flink by apache.
Class ParquetSplitReaderUtil, method genPartColumnarRowReader.
/**
 * Utility for generating a partitioned {@link ParquetColumnarRowSplitReader}.
 */
public static ParquetColumnarRowSplitReader genPartColumnarRowReader(
        boolean utcTimestamp, boolean caseSensitive, Configuration conf,
        String[] fullFieldNames, DataType[] fullFieldTypes, Map<String, Object> partitionSpec,
        int[] selectedFields, int batchSize, Path path, long splitStart, long splitLength)
        throws IOException {
    List<String> nonPartNames = Arrays.stream(fullFieldNames)
            .filter(n -> !partitionSpec.containsKey(n))
            .collect(Collectors.toList());
    List<String> selNonPartNames = Arrays.stream(selectedFields)
            .mapToObj(i -> fullFieldNames[i])
            .filter(nonPartNames::contains)
            .collect(Collectors.toList());
    int[] selParquetFields = selNonPartNames.stream().mapToInt(nonPartNames::indexOf).toArray();
    ParquetColumnarRowSplitReader.ColumnBatchGenerator gen = readVectors -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = fullFieldNames[selectedFields[i]];
            LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
            vectors[i] = partitionSpec.containsKey(name)
                    ? createVectorFromConstant(type, partitionSpec.get(name), batchSize)
                    : readVectors[selNonPartNames.indexOf(name)];
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new ParquetColumnarRowSplitReader(
            utcTimestamp, caseSensitive, conf,
            Arrays.stream(selParquetFields)
                    .mapToObj(i -> fullFieldTypes[i].getLogicalType())
                    .toArray(LogicalType[]::new),
            selNonPartNames.toArray(new String[0]),
            gen, batchSize,
            new org.apache.hadoop.fs.Path(path.toUri()),
            splitStart, splitLength);
}
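A hedged usage sketch follows. The field names, the Hive-style dt=2024-01-01 path, and the read loop using the reader's reachedEnd()/nextRecord()/close() methods are assumptions for illustration; splitStart and splitLength are set to cover the whole file.

    String[] fullFieldNames = {"id", "name", "dt"};
    DataType[] fullFieldTypes = {DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING()};
    // The partition value comes from the directory name, not from the Parquet file itself.
    Map<String, Object> partitionSpec = Collections.singletonMap("dt", "2024-01-01");
    int[] selectedFields = {0, 2}; // project "id" plus the partition column "dt"
    Path path = new Path("/tmp/warehouse/t/dt=2024-01-01/part-0.parquet"); // illustrative path
    long fileLength = path.getFileSystem().getFileStatus(path).getLen();
    ParquetColumnarRowSplitReader reader = ParquetSplitReaderUtil.genPartColumnarRowReader(
            true, true, new Configuration(),
            fullFieldNames, fullFieldTypes, partitionSpec, selectedFields,
            500, path, 0, fileLength);
    try {
        while (!reader.reachedEnd()) {
            RowData row = reader.nextRecord();
            // row.getInt(0) is read from Parquet; row.getString(1) is the constant partition value.
        }
    } finally {
        reader.close();
    }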
Use of org.apache.flink.table.data.columnar.vector.ColumnVector in project flink by apache.
Class ArrowUtilsTest, method testCreateArrowReader.
@Test
public void testCreateArrowReader() {
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ArrowReader reader = ArrowUtils.createArrowReader(root, rowType);
    ColumnVector[] columnVectors = reader.getColumnVectors();
    for (int i = 0; i < columnVectors.length; i++) {
        assertEquals(testFields.get(i).f4, columnVectors[i].getClass());
    }
}
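For orientation, a minimal sketch of how the fixtures used by this test (the allocator and rowType) might be assembled; the two-column row type here is an assumption for illustration, not the actual test schema.

    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    RowType rowType = RowType.of(new IntType(), new VarCharType(VarCharType.MAX_LENGTH));
    VectorSchemaRoot root = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(rowType), allocator);
    ArrowReader reader = ArrowUtils.createArrowReader(root, rowType);
    // Per createColumnVector below, the INT field is exposed as an ArrowIntColumnVector
    // and the VARCHAR field as an ArrowVarCharColumnVector.
    ColumnVector[] columnVectors = reader.getColumnVectors();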
Use of org.apache.flink.table.data.columnar.vector.ColumnVector in project flink by apache.
Class ParquetSplitReaderUtil, method createVectorFromConstant.
public static ColumnVector createVectorFromConstant(LogicalType type, Object value, int batchSize) {
    switch (type.getTypeRoot()) {
        case CHAR:
        case VARCHAR:
        case BINARY:
        case VARBINARY:
            HeapBytesVector bsv = new HeapBytesVector(batchSize);
            if (value == null) {
                bsv.fillWithNulls();
            } else {
                bsv.fill(value instanceof byte[]
                        ? (byte[]) value
                        : value.toString().getBytes(StandardCharsets.UTF_8));
            }
            return bsv;
        case BOOLEAN:
            HeapBooleanVector bv = new HeapBooleanVector(batchSize);
            if (value == null) {
                bv.fillWithNulls();
            } else {
                bv.fill((boolean) value);
            }
            return bv;
        case TINYINT:
            HeapByteVector byteVector = new HeapByteVector(batchSize);
            if (value == null) {
                byteVector.fillWithNulls();
            } else {
                byteVector.fill(((Number) value).byteValue());
            }
            return byteVector;
        case SMALLINT:
            HeapShortVector sv = new HeapShortVector(batchSize);
            if (value == null) {
                sv.fillWithNulls();
            } else {
                sv.fill(((Number) value).shortValue());
            }
            return sv;
        case INTEGER:
            HeapIntVector iv = new HeapIntVector(batchSize);
            if (value == null) {
                iv.fillWithNulls();
            } else {
                iv.fill(((Number) value).intValue());
            }
            return iv;
        case BIGINT:
            HeapLongVector lv = new HeapLongVector(batchSize);
            if (value == null) {
                lv.fillWithNulls();
            } else {
                lv.fill(((Number) value).longValue());
            }
            return lv;
        case DECIMAL:
            DecimalType decimalType = (DecimalType) type;
            int precision = decimalType.getPrecision();
            int scale = decimalType.getScale();
            DecimalData decimal = value == null
                    ? null
                    : Preconditions.checkNotNull(
                            DecimalData.fromBigDecimal((BigDecimal) value, precision, scale));
            ColumnVector internalVector;
            if (ParquetSchemaConverter.is32BitDecimal(precision)) {
                internalVector = createVectorFromConstant(
                        new IntType(), decimal == null ? null : (int) decimal.toUnscaledLong(), batchSize);
            } else if (ParquetSchemaConverter.is64BitDecimal(precision)) {
                internalVector = createVectorFromConstant(
                        new BigIntType(), decimal == null ? null : decimal.toUnscaledLong(), batchSize);
            } else {
                internalVector = createVectorFromConstant(
                        new VarBinaryType(), decimal == null ? null : decimal.toUnscaledBytes(), batchSize);
            }
            return new ParquetDecimalVector(internalVector);
        case FLOAT:
            HeapFloatVector fv = new HeapFloatVector(batchSize);
            if (value == null) {
                fv.fillWithNulls();
            } else {
                fv.fill(((Number) value).floatValue());
            }
            return fv;
        case DOUBLE:
            HeapDoubleVector dv = new HeapDoubleVector(batchSize);
            if (value == null) {
                dv.fillWithNulls();
            } else {
                dv.fill(((Number) value).doubleValue());
            }
            return dv;
        case DATE:
            if (value instanceof LocalDate) {
                value = Date.valueOf((LocalDate) value);
            }
            return createVectorFromConstant(
                    new IntType(), value == null ? null : toInternal((Date) value), batchSize);
        case TIMESTAMP_WITHOUT_TIME_ZONE:
            HeapTimestampVector tv = new HeapTimestampVector(batchSize);
            if (value == null) {
                tv.fillWithNulls();
            } else {
                tv.fill(TimestampData.fromLocalDateTime((LocalDateTime) value));
            }
            return tv;
        default:
            throw new UnsupportedOperationException("Unsupported type: " + type);
    }
}
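A short sketch of how this helper might be called to materialize partition constants; the concrete values and the batch size of 1024 are illustrative only.

    // An INT constant: every one of the 1024 slots reads back as 42.
    ColumnVector ints = createVectorFromConstant(new IntType(), 42, 1024);
    // A DATE constant given as java.time.LocalDate: converted to java.sql.Date and then to the
    // internal epoch-day representation, backed by a HeapIntVector.
    ColumnVector dates = createVectorFromConstant(new DateType(), LocalDate.of(2024, 1, 1), 1024);
    // A null constant: the vector is filled with nulls for the whole batch.
    ColumnVector nulls = createVectorFromConstant(new DoubleType(), null, 1024);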
Use of org.apache.flink.table.data.columnar.vector.ColumnVector in project flink by apache.
Class ArrowUtils, method createColumnVector.
public static ColumnVector createColumnVector(ValueVector vector, LogicalType fieldType) {
    if (vector instanceof TinyIntVector) {
        return new ArrowTinyIntColumnVector((TinyIntVector) vector);
    } else if (vector instanceof SmallIntVector) {
        return new ArrowSmallIntColumnVector((SmallIntVector) vector);
    } else if (vector instanceof IntVector) {
        return new ArrowIntColumnVector((IntVector) vector);
    } else if (vector instanceof BigIntVector) {
        return new ArrowBigIntColumnVector((BigIntVector) vector);
    } else if (vector instanceof BitVector) {
        return new ArrowBooleanColumnVector((BitVector) vector);
    } else if (vector instanceof Float4Vector) {
        return new ArrowFloatColumnVector((Float4Vector) vector);
    } else if (vector instanceof Float8Vector) {
        return new ArrowDoubleColumnVector((Float8Vector) vector);
    } else if (vector instanceof VarCharVector) {
        return new ArrowVarCharColumnVector((VarCharVector) vector);
    } else if (vector instanceof VarBinaryVector) {
        return new ArrowVarBinaryColumnVector((VarBinaryVector) vector);
    } else if (vector instanceof DecimalVector) {
        return new ArrowDecimalColumnVector((DecimalVector) vector);
    } else if (vector instanceof DateDayVector) {
        return new ArrowDateColumnVector((DateDayVector) vector);
    } else if (vector instanceof TimeSecVector
            || vector instanceof TimeMilliVector
            || vector instanceof TimeMicroVector
            || vector instanceof TimeNanoVector) {
        return new ArrowTimeColumnVector(vector);
    } else if (vector instanceof TimeStampVector
            && ((ArrowType.Timestamp) vector.getField().getType()).getTimezone() == null) {
        return new ArrowTimestampColumnVector(vector);
    } else if (vector instanceof ListVector) {
        ListVector listVector = (ListVector) vector;
        return new ArrowArrayColumnVector(
                listVector,
                createColumnVector(listVector.getDataVector(), ((ArrayType) fieldType).getElementType()));
    } else if (vector instanceof StructVector) {
        StructVector structVector = (StructVector) vector;
        ColumnVector[] fieldColumns = new ColumnVector[structVector.size()];
        for (int i = 0; i < fieldColumns.length; ++i) {
            fieldColumns[i] = createColumnVector(
                    structVector.getVectorById(i), ((RowType) fieldType).getTypeAt(i));
        }
        return new ArrowRowColumnVector(structVector, fieldColumns);
    } else {
        throw new UnsupportedOperationException(String.format("Unsupported type %s.", fieldType));
    }
}
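To make the mapping concrete, a small hedged sketch that wraps a hand-built Arrow IntVector; the vector name, allocator, and sample values are illustrative.

    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
    IntVector arrowInts = new IntVector("id", allocator);
    arrowInts.allocateNew(3);
    arrowInts.set(0, 1);
    arrowInts.set(1, 2);
    arrowInts.setNull(2);
    arrowInts.setValueCount(3);
    // The Arrow vector is wrapped, not copied: reads go straight through to Arrow memory.
    ColumnVector flinkVector = ArrowUtils.createColumnVector(arrowInts, new IntType());
    // flinkVector is an ArrowIntColumnVector; flinkVector.isNullAt(2) is true and
    // ((IntColumnVector) flinkVector).getInt(0) returns 1.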