
Example 6 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class ParquetSplitReaderUtil, the method genPartColumnarRowReader:

/**
 * Util for generating partitioned {@link ParquetColumnarRowSplitReader}.
 */
public static ParquetColumnarRowSplitReader genPartColumnarRowReader(
        boolean utcTimestamp, boolean caseSensitive, Configuration conf,
        String[] fullFieldNames, DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec, int[] selectedFields, int batchSize,
        Path path, long splitStart, long splitLength) throws IOException {
    // Physical columns that actually exist in the Parquet file.
    List<String> nonPartNames = Arrays.stream(fullFieldNames)
            .filter(n -> !partitionSpec.containsKey(n))
            .collect(Collectors.toList());
    // Selected non-partition fields, in selection order, and their on-disk indices.
    List<String> selNonPartNames = Arrays.stream(selectedFields)
            .mapToObj(i -> fullFieldNames[i])
            .filter(nonPartNames::contains)
            .collect(Collectors.toList());
    int[] selParquetFields = selNonPartNames.stream().mapToInt(nonPartNames::indexOf).toArray();
    ParquetColumnarRowSplitReader.ColumnBatchGenerator gen = readVectors -> {
        // Create and initialize the row batch: partition columns become constant
        // vectors, file columns reuse the vectors read from Parquet.
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = fullFieldNames[selectedFields[i]];
            LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
            vectors[i] = partitionSpec.containsKey(name)
                    ? createVectorFromConstant(type, partitionSpec.get(name), batchSize)
                    : readVectors[selNonPartNames.indexOf(name)];
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new ParquetColumnarRowSplitReader(utcTimestamp, caseSensitive, conf,
            Arrays.stream(selParquetFields)
                    .mapToObj(i -> fullFieldTypes[i].getLogicalType())
                    .toArray(LogicalType[]::new),
            selNonPartNames.toArray(new String[0]), gen, batchSize,
            new org.apache.hadoop.fs.Path(path.toUri()), splitStart, splitLength);
}
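
For context, here is a minimal, hypothetical call-site sketch for the utility above. The schema, path, and partition value are illustrative placeholders (not from the Flink code base); it assumes the imports org.apache.flink.table.api.DataTypes, org.apache.flink.core.fs.Path, and org.apache.hadoop.conf.Configuration, and that the returned reader is Closeable, as Flink's split readers are.

// Hypothetical call site; all names and values below are placeholders.
String[] fullFieldNames = {"id", "name", "dt"};
DataType[] fullFieldTypes = {DataTypes.BIGINT(), DataTypes.STRING(), DataTypes.STRING()};
Map<String, Object> partitionSpec = Collections.singletonMap("dt", "2024-01-01");
int[] selectedFields = {0, 2}; // project "id" plus the partition column "dt"
try (ParquetColumnarRowSplitReader reader =
        ParquetSplitReaderUtil.genPartColumnarRowReader(
                true,                  // utcTimestamp
                true,                  // caseSensitive
                new Configuration(),   // Hadoop configuration
                fullFieldNames,
                fullFieldTypes,
                partitionSpec,
                selectedFields,
                2048,                  // batchSize
                new Path("/tmp/part-0.parquet"),
                0L,                    // splitStart
                Long.MAX_VALUE)) {     // splitLength: read to end of file
    // Consume records from the reader here.
}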

Example 7 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class OrcNoHiveSplitReaderUtil, the method genPartColumnarRowReader:

/**
 * Util for generating partitioned {@link OrcColumnarRowSplitReader}.
 */
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        Configuration conf, String[] fullFieldNames, DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec, int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates, int batchSize,
        Path path, long splitStart, long splitLength) throws IOException {
    List<String> nonPartNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] selectedOrcFields = getSelectedOrcFields(fullFieldNames, selectedFields, nonPartNames);
    OrcColumnarRowSplitReader.ColumnBatchGenerator<VectorizedRowBatch> gen =
            (VectorizedRowBatch rowBatch) -> {
                // Create and initialize the row batch: partition columns become
                // constant vectors, file columns wrap the ORC column vectors.
                ColumnVector[] vectors = new ColumnVector[selectedFields.length];
                for (int i = 0; i < vectors.length; i++) {
                    String name = fullFieldNames[selectedFields[i]];
                    LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
                    vectors[i] = partitionSpec.containsKey(name)
                            ? createFlinkVectorFromConstant(type, partitionSpec.get(name), batchSize)
                            : createFlinkVector(rowBatch.cols[nonPartNames.indexOf(name)]);
                }
                return new VectorizedColumnBatch(vectors);
            };
    return new OrcColumnarRowSplitReader<>(new OrcNoHiveShim(), conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            selectedOrcFields, gen, conjunctPredicates, batchSize, path, splitStart, splitLength);
}
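
The createFlinkVectorFromConstant call fills an entire batch with one repeated partition value. Below is a minimal, self-contained sketch of that idea for a BIGINT partition column; it is not Flink source, only an illustration using the columnar vector classes from the imports on this page (assuming the Flink 1.15+ package layout).

import org.apache.flink.table.data.columnar.vector.ColumnVector;
import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch;
import org.apache.flink.table.data.columnar.vector.heap.HeapLongVector;

public class ConstantVectorSketch {
    public static void main(String[] args) {
        int batchSize = 4;
        // A partition column is constant within a split: write the same value
        // into every slot, mirroring what createFlinkVectorFromConstant does.
        HeapLongVector dt = new HeapLongVector(batchSize);
        for (int i = 0; i < batchSize; i++) {
            dt.setLong(i, 20240101L); // hypothetical partition value
        }
        VectorizedColumnBatch batch = new VectorizedColumnBatch(new ColumnVector[] {dt});
        batch.setNumRows(batchSize);
        System.out.println(batch.getLong(2, 0)); // prints 20240101
    }
}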

Example 8 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class OrcSplitReaderUtil, the method genPartColumnarRowReader:

/**
 * Util for generating partitioned {@link OrcColumnarRowSplitReader}.
 */
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        String hiveVersion, Configuration conf, String[] fullFieldNames,
        DataType[] fullFieldTypes, Map<String, Object> partitionSpec,
        int[] selectedFields, List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize, Path path, long splitStart, long splitLength) throws IOException {
    List<String> nonPartNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] selectedOrcFields = getSelectedOrcFields(fullFieldNames, selectedFields, nonPartNames);
    ColumnBatchGenerator<VectorizedRowBatch> gen = (VectorizedRowBatch rowBatch) -> {
        // Create and initialize the row batch: partition columns become constant
        // vectors, file columns wrap the ORC column vectors with their type.
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = fullFieldNames[selectedFields[i]];
            LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
            vectors[i] = partitionSpec.containsKey(name)
                    ? createFlinkVectorFromConstant(type, partitionSpec.get(name), batchSize)
                    : createFlinkVector(rowBatch.cols[nonPartNames.indexOf(name)], type);
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new OrcColumnarRowSplitReader<>(OrcShim.createShim(hiveVersion), conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            selectedOrcFields, gen, conjunctPredicates, batchSize, path, splitStart, splitLength);
}
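
Both ORC variants delegate their index arithmetic to getNonPartNames and getSelectedOrcFields. Here is a standalone sketch in plain Java (no Flink types, hypothetical field names) of how selected field indices get remapped onto the non-partition, on-disk column order:

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class FieldRemapSketch {
    public static void main(String[] args) {
        String[] fullFieldNames = {"id", "name", "dt"}; // "dt" is a partition column
        Set<String> partitionKeys = Set.of("dt");
        int[] selectedFields = {2, 0}; // project "dt", then "id"

        // On-disk columns: the full schema minus partition columns.
        List<String> nonPartNames = Arrays.stream(fullFieldNames)
                .filter(n -> !partitionKeys.contains(n))
                .collect(Collectors.toList());

        // Selected fields that actually live in the file, in selection order.
        List<String> selNonPartNames = Arrays.stream(selectedFields)
                .mapToObj(i -> fullFieldNames[i])
                .filter(nonPartNames::contains)
                .collect(Collectors.toList());

        // Their indices within the on-disk column order.
        int[] selFileFields =
                selNonPartNames.stream().mapToInt(nonPartNames::indexOf).toArray();

        System.out.println(nonPartNames);                    // [id, name]
        System.out.println(selNonPartNames);                 // [id]
        System.out.println(Arrays.toString(selFileFields));  // [0]
    }
}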

Example 9 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class ParquetVectorizedInputFormat, the method createReaderBatch:

private ParquetReaderBatch<T> createReaderBatch(
        SplitT split, MessageType requestedSchema, Pool.Recycler<ParquetReaderBatch<T>> recycler) {
    // Allocate writable vectors for the requested schema, expose them to the
    // batch factory as readable vectors, and bundle both with the recycler.
    WritableColumnVector[] writableVectors = createWritableVectors(requestedSchema);
    VectorizedColumnBatch columnarBatch =
            batchFactory.create(split, createReadableVectors(writableVectors));
    return createReaderBatch(writableVectors, columnarBatch, recycler);
}
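
The Pool.Recycler parameter exists so a finished batch can be handed back and reused rather than reallocated on every read. Below is a stripped-down sketch of that pooling pattern; the method names echo, but are not, Flink's org.apache.flink.connector.file.src.util.Pool API.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

// A minimal object pool: batches are borrowed, used, and recycled back
// instead of being garbage-collected and reallocated for every read.
public class BatchPoolSketch<T> {
    private final BlockingQueue<T> pool;

    public BatchPoolSketch(int capacity) {
        this.pool = new ArrayBlockingQueue<>(capacity);
    }

    public void add(T batch) {
        pool.add(batch);
    }

    public T pollEntry() throws InterruptedException {
        return pool.take(); // blocks until a batch has been recycled
    }

    public void recycle(T batch) {
        pool.add(batch); // return the batch for reuse
    }

    public static void main(String[] args) throws InterruptedException {
        BatchPoolSketch<long[]> pool = new BatchPoolSketch<>(2);
        pool.add(new long[1024]);
        pool.add(new long[1024]);
        long[] batch = pool.pollEntry(); // borrow a batch
        pool.recycle(batch);             // hand it back for the next read
    }
}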
