Search in sources :

Example 1 with OrcNoHiveShim

use of org.apache.flink.orc.nohive.shim.OrcNoHiveShim in project flink by apache.

the class OrcNoHiveColumnarRowInputFormat method createPartitionedFormat.

/**
 * Builds an {@link OrcColumnarRowInputFormat} for a table that has partition columns.
 *
 * <p>For each selected field the produced {@link VectorizedColumnBatch} contains either a
 * constant vector (when the field name is listed in {@code partitionKeys}; the value is obtained
 * from {@code extractor} for the current split) or a vector wrapping the corresponding column of
 * the ORC row batch.
 *
 * @param hadoopConfig configuration passed on to the created input format
 * @param tableType row type of the whole table; supplies field names, field types and fields
 * @param partitionKeys names of the fields that are treated as partition columns
 * @param extractor produces the value of a partition field for a given split
 * @param selectedFields indices into the fields of {@code tableType} that should be produced
 * @param conjunctPredicates predicates passed on to the created input format
 * @param batchSize batch size passed on to the input format and used to size constant vectors
 * @param rowTypeInfoFactory maps the projected row type to the type information handed to the
 *     created input format
 * @return the configured input format
 */
public static <SplitT extends FileSourceSplit> OrcColumnarRowInputFormat<VectorizedRowBatch, SplitT> createPartitionedFormat(
        Configuration hadoopConfig,
        RowType tableType,
        List<String> partitionKeys,
        PartitionFieldExtractor<SplitT> extractor,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Function<RowType, TypeInformation<RowData>> rowTypeInfoFactory) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the orc format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    String[] allFieldNames = tableType.getFieldNames().toArray(new String[0]);
    LogicalType[] allFieldTypes = tableType.getChildren().toArray(new LogicalType[0]);

    // Names of the fields that are not partition keys, and the selected indices among them.
    List<String> physicalFieldNames = getNonPartNames(allFieldNames, partitionKeys);
    int[] physicalSelection = getSelectedOrcFields(allFieldNames, selectedFields, physicalFieldNames);

    ColumnBatchFactory<VectorizedRowBatch, SplitT> batchFactory = (SplitT split, VectorizedRowBatch rowBatch) -> {
        // One output vector per selected field, in selection order.
        ColumnVector[] columns = new ColumnVector[selectedFields.length];
        for (int pos = 0; pos < columns.length; pos++) {
            int fieldIndex = selectedFields[pos];
            String fieldName = allFieldNames[fieldIndex];
            LogicalType fieldType = allFieldTypes[fieldIndex];
            if (!partitionKeys.contains(fieldName)) {
                // Regular column: wrap the ORC vector found at the field's non-partition index.
                columns[pos] = createFlinkVector(rowBatch.cols[physicalFieldNames.indexOf(fieldName)]);
            } else {
                // Partition column: constant vector built from the value extracted for the split.
                columns[pos] = createFlinkVectorFromConstant(
                        fieldType, extractor.extract(split, fieldName, fieldType), batchSize);
            }
        }
        return new VectorizedColumnBatch(columns);
    };

    return new OrcColumnarRowInputFormat<>(
            new OrcNoHiveShim(),
            hadoopConfig,
            convertToOrcTypeWithPart(allFieldNames, allFieldTypes, partitionKeys),
            physicalSelection,
            conjunctPredicates,
            batchSize,
            batchFactory,
            // Produced type: the row type restricted to the selected fields, in selection order.
            rowTypeInfoFactory.apply(
                    new RowType(
                            Arrays.stream(selectedFields)
                                    .mapToObj(i -> tableType.getFields().get(i))
                                    .collect(Collectors.toList()))));
}
Also used : LogicalType(org.apache.flink.table.types.logical.LogicalType) RowType(org.apache.flink.table.types.logical.RowType) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) VectorizedColumnBatch(org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch) OrcColumnarRowInputFormat(org.apache.flink.orc.OrcColumnarRowInputFormat) OrcNoHiveShim(org.apache.flink.orc.nohive.shim.OrcNoHiveShim)

Example 2 with OrcNoHiveShim

use of org.apache.flink.orc.nohive.shim.OrcNoHiveShim in project flink by apache.

the class OrcNoHiveSplitReaderUtil method genPartColumnarRowReader.

/**
 * Creates an {@link OrcColumnarRowSplitReader} for a partitioned table.
 *
 * <p>For each selected field the generated {@link VectorizedColumnBatch} contains either a
 * constant vector holding the value from {@code partitionSpec} (when the field name is a key of
 * that map) or a vector wrapping the corresponding column of the ORC row batch.
 *
 * @param conf configuration passed on to the created reader
 * @param fullFieldNames names of all fields, indexed by the entries of {@code selectedFields}
 * @param fullFieldTypes data types of all fields, parallel to {@code fullFieldNames}
 * @param partitionSpec partition field name to partition value
 * @param selectedFields indices into {@code fullFieldNames} of the fields to produce
 * @param conjunctPredicates predicates passed on to the created reader
 * @param batchSize batch size passed on to the reader and used to size constant vectors
 * @param path path passed on to the created reader
 * @param splitStart split start passed on to the created reader
 * @param splitLength split length passed on to the created reader
 * @return the configured split reader
 * @throws IOException declared for failures while setting up the reader
 */
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        Configuration conf,
        String[] fullFieldNames,
        DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Path path,
        long splitStart,
        long splitLength) throws IOException {
    // Names of the fields that are not partition keys, and the selected indices among them.
    List<String> physicalFieldNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] physicalSelection = getSelectedOrcFields(fullFieldNames, selectedFields, physicalFieldNames);

    OrcColumnarRowSplitReader.ColumnBatchGenerator<VectorizedRowBatch> batchGenerator = (VectorizedRowBatch rowBatch) -> {
        // One output vector per selected field, in selection order.
        ColumnVector[] columns = new ColumnVector[selectedFields.length];
        for (int pos = 0; pos < columns.length; pos++) {
            int fieldIndex = selectedFields[pos];
            String fieldName = fullFieldNames[fieldIndex];
            LogicalType fieldType = fullFieldTypes[fieldIndex].getLogicalType();
            if (!partitionSpec.containsKey(fieldName)) {
                // Regular column: wrap the ORC vector found at the field's non-partition index.
                columns[pos] = createFlinkVector(rowBatch.cols[physicalFieldNames.indexOf(fieldName)]);
            } else {
                // Partition column: constant vector filled with the value from the partition spec.
                columns[pos] = createFlinkVectorFromConstant(fieldType, partitionSpec.get(fieldName), batchSize);
            }
        }
        return new VectorizedColumnBatch(columns);
    };

    return new OrcColumnarRowSplitReader<>(
            new OrcNoHiveShim(),
            conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            physicalSelection,
            batchGenerator,
            conjunctPredicates,
            batchSize,
            path,
            splitStart,
            splitLength);
}
Also used : VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) VectorizedColumnBatch(org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch) OrcNoHiveShim(org.apache.flink.orc.nohive.shim.OrcNoHiveShim) LogicalType(org.apache.flink.table.types.logical.LogicalType) OrcColumnarRowSplitReader(org.apache.flink.orc.OrcColumnarRowSplitReader)

Aggregations

OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim)2 VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch)2 LogicalType (org.apache.flink.table.types.logical.LogicalType)2 VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch)2 OrcColumnarRowInputFormat (org.apache.flink.orc.OrcColumnarRowInputFormat)1 OrcColumnarRowSplitReader (org.apache.flink.orc.OrcColumnarRowSplitReader)1 RowType (org.apache.flink.table.types.logical.RowType)1