
Example 1 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class ArrowReader, the method read:

/**
 * Reads the specified row from the underlying Arrow format data.
 */
public RowData read(int rowId) {
    reuseRow.setVectorizedColumnBatch(new VectorizedColumnBatch(columnVectors));
    reuseRow.setRowId(rowId);
    return reuseRow;
}
Also used: VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch)
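
The read method above relies on a VectorizedColumnBatch that wraps the Arrow-backed column vectors and on a reusable ColumnarRowData that is simply repositioned per row. The following standalone sketch is not taken from the Flink sources; it shows the same pattern with heap-backed vectors and assumes the HeapIntVector class and the setNumRows/setRowId/getInt calls match the columnar API shipped with the flink-table modules.

import org.apache.flink.table.data.columnar.ColumnarRowData;
import org.apache.flink.table.data.columnar.vector.ColumnVector;
import org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch;
import org.apache.flink.table.data.columnar.vector.heap.HeapIntVector;

public class VectorizedColumnBatchSketch {

    public static void main(String[] args) {
        int numRows = 3;

        // Fill one writable int column (heap-backed implementation assumed).
        HeapIntVector ids = new HeapIntVector(numRows);
        for (int i = 0; i < numRows; i++) {
            ids.setInt(i, i * 10);
        }

        // Wrap the column vectors into a batch and record how many rows are valid.
        VectorizedColumnBatch batch = new VectorizedColumnBatch(new ColumnVector[] {ids});
        batch.setNumRows(numRows);

        // One reusable row view, repositioned per row just like ArrowReader#read.
        ColumnarRowData reuseRow = new ColumnarRowData(batch);
        for (int rowId = 0; rowId < batch.getNumRows(); rowId++) {
            reuseRow.setRowId(rowId);
            System.out.println(reuseRow.getInt(0)); // prints 0, 10, 20
        }
    }
}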

Example 2 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class OrcNoHiveColumnarRowInputFormat, the method createPartitionedFormat:

/**
 * Creates a partitioned {@link OrcColumnarRowInputFormat}; the partition columns can be
 * generated from the split.
 */
public static <SplitT extends FileSourceSplit>
        OrcColumnarRowInputFormat<VectorizedRowBatch, SplitT> createPartitionedFormat(
                Configuration hadoopConfig,
                RowType tableType,
                List<String> partitionKeys,
                PartitionFieldExtractor<SplitT> extractor,
                int[] selectedFields,
                List<OrcFilters.Predicate> conjunctPredicates,
                int batchSize,
                Function<RowType, TypeInformation<RowData>> rowTypeInfoFactory) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the orc format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    String[] tableFieldNames = tableType.getFieldNames().toArray(new String[0]);
    LogicalType[] tableFieldTypes = tableType.getChildren().toArray(new LogicalType[0]);
    List<String> orcFieldNames = getNonPartNames(tableFieldNames, partitionKeys);
    int[] orcSelectedFields = getSelectedOrcFields(tableFieldNames, selectedFields, orcFieldNames);
    ColumnBatchFactory<VectorizedRowBatch, SplitT> batchGenerator = (SplitT split, VectorizedRowBatch rowBatch) -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = tableFieldNames[selectedFields[i]];
            LogicalType type = tableFieldTypes[selectedFields[i]];
            vectors[i] =
                    partitionKeys.contains(name)
                            ? createFlinkVectorFromConstant(
                                    type, extractor.extract(split, name, type), batchSize)
                            : createFlinkVector(rowBatch.cols[orcFieldNames.indexOf(name)]);
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new OrcColumnarRowInputFormat<>(
            new OrcNoHiveShim(),
            hadoopConfig,
            convertToOrcTypeWithPart(tableFieldNames, tableFieldTypes, partitionKeys),
            orcSelectedFields,
            conjunctPredicates,
            batchSize,
            batchGenerator,
            rowTypeInfoFactory.apply(
                    new RowType(
                            Arrays.stream(selectedFields)
                                    .mapToObj(i -> tableType.getFields().get(i))
                                    .collect(Collectors.toList()))));
}
Also used: LogicalType (org.apache.flink.table.types.logical.LogicalType), RowType (org.apache.flink.table.types.logical.RowType), VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch), VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch), OrcColumnarRowInputFormat (org.apache.flink.orc.OrcColumnarRowInputFormat), OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim)

Example 3 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class OrcColumnarRowInputFormat, the method createPartitionedFormat:

/**
 * Creates a partitioned {@link OrcColumnarRowInputFormat}; the partition columns can be
 * generated from the split.
 */
public static <SplitT extends FileSourceSplit>
        OrcColumnarRowInputFormat<VectorizedRowBatch, SplitT> createPartitionedFormat(
                OrcShim<VectorizedRowBatch> shim,
                Configuration hadoopConfig,
                RowType tableType,
                List<String> partitionKeys,
                PartitionFieldExtractor<SplitT> extractor,
                int[] selectedFields,
                List<OrcFilters.Predicate> conjunctPredicates,
                int batchSize,
                Function<RowType, TypeInformation<RowData>> rowTypeInfoFactory) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the orc format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    String[] tableFieldNames = tableType.getFieldNames().toArray(new String[0]);
    LogicalType[] tableFieldTypes = tableType.getChildren().toArray(new LogicalType[0]);
    List<String> orcFieldNames = getNonPartNames(tableFieldNames, partitionKeys);
    int[] orcSelectedFields = getSelectedOrcFields(tableFieldNames, selectedFields, orcFieldNames);
    ColumnBatchFactory<VectorizedRowBatch, SplitT> batchGenerator = (SplitT split, VectorizedRowBatch rowBatch) -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = tableFieldNames[selectedFields[i]];
            LogicalType type = tableFieldTypes[selectedFields[i]];
            vectors[i] =
                    partitionKeys.contains(name)
                            ? createFlinkVectorFromConstant(
                                    type, extractor.extract(split, name, type), batchSize)
                            : createFlinkVector(rowBatch.cols[orcFieldNames.indexOf(name)], type);
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new OrcColumnarRowInputFormat<>(
            shim,
            hadoopConfig,
            convertToOrcTypeWithPart(tableFieldNames, tableFieldTypes, partitionKeys),
            orcSelectedFields,
            conjunctPredicates,
            batchSize,
            batchGenerator,
            rowTypeInfoFactory.apply(
                    new RowType(
                            Arrays.stream(selectedFields)
                                    .mapToObj(i -> tableType.getFields().get(i))
                                    .collect(Collectors.toList()))));
}
Also used: VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch), VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch), LogicalType (org.apache.flink.table.types.logical.LogicalType), RowType (org.apache.flink.table.types.logical.RowType)
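
The two ORC examples above (the no-Hive variant and the one taking an explicit OrcShim) only differ in where the shim comes from; a caller still has to invoke createPartitionedFormat with a concrete schema and partition layout. The sketch below is illustrative rather than Flink code: the table schema, the "dt" partition key, and the default partition name are made up, and it assumes OrcShim.defaultShim(), PartitionFieldExtractor.forFileSystem, and InternalTypeInfo.of are available in the Flink version at hand.

import java.util.Collections;
import org.apache.flink.connector.file.src.FileSourceSplit;
import org.apache.flink.connector.file.table.PartitionFieldExtractor;
import org.apache.flink.orc.OrcColumnarRowInputFormat;
import org.apache.flink.orc.shim.OrcShim;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.DoubleType;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

public class OrcPartitionedFormatSketch {

    public static OrcColumnarRowInputFormat<VectorizedRowBatch, FileSourceSplit> build() {
        // Hypothetical table: two data columns plus a "dt" partition column.
        RowType tableType =
                RowType.of(
                        new LogicalType[] {new IntType(), new DoubleType(), new VarCharType(10)},
                        new String[] {"id", "score", "dt"});

        return OrcColumnarRowInputFormat.createPartitionedFormat(
                OrcShim.defaultShim(),                            // Hive-based ORC shim (assumed factory)
                new Configuration(),
                tableType,
                Collections.singletonList("dt"),                  // partition keys
                PartitionFieldExtractor.forFileSystem("__DEFAULT_PARTITION__"),
                new int[] {0, 1, 2},                              // project all three columns
                Collections.emptyList(),                          // no pushed-down predicates
                2048,                                             // rows per VectorizedColumnBatch
                rowType -> InternalTypeInfo.of(rowType));         // produced TypeInformation
    }
}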

Example 4 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class ParquetColumnarRowSplitReaderTest, the method testProject:

@Test
public void testProject() throws IOException {
    // prepare parquet file
    int number = 1000;
    List<Row> records = new ArrayList<>(number);
    for (int i = 0; i < number; i++) {
        Integer v = i;
        records.add(newRow(v));
    }
    Path testPath = createTempParquetFile(TEMPORARY_FOLDER.newFolder(), PARQUET_SCHEMA, records, rowGroupSize);
    // test reader
    LogicalType[] fieldTypes = new LogicalType[] { new DoubleType(), new TinyIntType(), new IntType() };
    ParquetColumnarRowSplitReader reader =
            new ParquetColumnarRowSplitReader(
                    false,
                    true,
                    new Configuration(),
                    fieldTypes,
                    new String[] {"f7", "f2", "f4"},
                    VectorizedColumnBatch::new,
                    500,
                    new org.apache.hadoop.fs.Path(testPath.getPath()),
                    0,
                    Long.MAX_VALUE);
    int i = 0;
    while (!reader.reachedEnd()) {
        ColumnarRowData row = reader.nextRecord();
        assertEquals(i, row.getDouble(0), 0);
        assertEquals((byte) i, row.getByte(1));
        assertEquals(i, row.getInt(2));
        i++;
    }
    reader.close();
}
Also used: Path (org.apache.flink.core.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), LogicalType (org.apache.flink.table.types.logical.LogicalType), TinyIntType (org.apache.flink.table.types.logical.TinyIntType), IntType (org.apache.flink.table.types.logical.IntType), BigIntType (org.apache.flink.table.types.logical.BigIntType), SmallIntType (org.apache.flink.table.types.logical.SmallIntType), VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch), DoubleType (org.apache.flink.table.types.logical.DoubleType), ColumnarRowData (org.apache.flink.table.data.columnar.ColumnarRowData), Row (org.apache.flink.types.Row), Test (org.junit.Test)

Example 5 with VectorizedColumnBatch

Use of org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch in project flink by apache.

From the class ParquetColumnarRowInputFormat, the method createPartitionedFormat:

/**
 * Creates a partitioned {@link ParquetColumnarRowInputFormat}; the partition columns can be
 * generated from the {@link Path}.
 */
public static <SplitT extends FileSourceSplit>
        ParquetColumnarRowInputFormat<SplitT> createPartitionedFormat(
                Configuration hadoopConfig,
                RowType producedRowType,
                TypeInformation<RowData> producedTypeInfo,
                List<String> partitionKeys,
                PartitionFieldExtractor<SplitT> extractor,
                int batchSize,
                boolean isUtcTimestamp,
                boolean isCaseSensitive) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the parquet format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    RowType projectedRowType =
            new RowType(
                    producedRowType.getFields().stream()
                            .filter(field -> !partitionKeys.contains(field.getName()))
                            .collect(Collectors.toList()));
    List<String> projectedNames = projectedRowType.getFieldNames();
    ColumnBatchFactory<SplitT> factory = (SplitT split, ColumnVector[] parquetVectors) -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[producedRowType.getFieldCount()];
        for (int i = 0; i < vectors.length; i++) {
            RowType.RowField field = producedRowType.getFields().get(i);
            vectors[i] =
                    partitionKeys.contains(field.getName())
                            ? createVectorFromConstant(
                                    field.getType(),
                                    extractor.extract(split, field.getName(), field.getType()),
                                    batchSize)
                            : parquetVectors[projectedNames.indexOf(field.getName())];
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new ParquetColumnarRowInputFormat<>(
            hadoopConfig,
            projectedRowType,
            producedTypeInfo,
            factory,
            batchSize,
            isUtcTimestamp,
            isCaseSensitive);
}
Also used: VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch), RowType (org.apache.flink.table.types.logical.RowType), WritableColumnVector (org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector), ColumnVector (org.apache.flink.table.data.columnar.vector.ColumnVector)
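
For Parquet the wiring is analogous but simpler, since the batch factory receives the already-created Parquet column vectors. Again a hedged sketch rather than Flink code: the schema, the "dt" partition key, the input path, and the default partition name are placeholders, and it assumes PartitionFieldExtractor.forFileSystem, InternalTypeInfo.of, and FileSource.forBulkFileFormat exist in the version in use.

import java.util.Collections;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.FileSourceSplit;
import org.apache.flink.connector.file.table.PartitionFieldExtractor;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.parquet.ParquetColumnarRowInputFormat;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.DoubleType;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.VarCharType;
import org.apache.hadoop.conf.Configuration;

public class ParquetPartitionedFormatSketch {

    public static FileSource<RowData> build() {
        // Hypothetical produced schema: two data columns plus a "dt" partition column.
        RowType producedRowType =
                RowType.of(
                        new LogicalType[] {new IntType(), new DoubleType(), new VarCharType(10)},
                        new String[] {"id", "score", "dt"});

        ParquetColumnarRowInputFormat<FileSourceSplit> format =
                ParquetColumnarRowInputFormat.createPartitionedFormat(
                        new Configuration(),
                        producedRowType,
                        InternalTypeInfo.of(producedRowType),     // produced TypeInformation
                        Collections.singletonList("dt"),          // partition keys
                        PartitionFieldExtractor.forFileSystem("__DEFAULT_PARTITION__"),
                        2048,                                     // rows per VectorizedColumnBatch
                        true,                                     // isUtcTimestamp
                        true);                                    // isCaseSensitive

        // Placeholder path; the /dt=.../ directory layout supplies the partition value.
        return FileSource.forBulkFileFormat(format, new Path("/tmp/parquet-input")).build();
    }
}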

Aggregations

VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch): 9
LogicalType (org.apache.flink.table.types.logical.LogicalType): 6
WritableColumnVector (org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector): 3
RowType (org.apache.flink.table.types.logical.RowType): 3
Path (org.apache.flink.core.fs.Path): 2
OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim): 2
ColumnVector (org.apache.flink.table.data.columnar.vector.ColumnVector): 2
BigIntType (org.apache.flink.table.types.logical.BigIntType): 2
IntType (org.apache.flink.table.types.logical.IntType): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
IOException (java.io.IOException): 1
BigDecimal (java.math.BigDecimal): 1
StandardCharsets (java.nio.charset.StandardCharsets): 1
Date (java.sql.Date): 1
LocalDate (java.time.LocalDate): 1
LocalDateTime (java.time.LocalDateTime): 1
ArrayList (java.util.ArrayList): 1
Arrays (java.util.Arrays): 1
List (java.util.List): 1
Map (java.util.Map): 1