Example 1 with VectorizedRowBatch

Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.

From the class OrcNoHiveColumnarRowInputFormat, method createPartitionedFormat.

/**
 * Creates a partitioned {@link OrcColumnarRowInputFormat}; the partition columns can be
 * generated from the split.
 */
public static <SplitT extends FileSourceSplit> OrcColumnarRowInputFormat<VectorizedRowBatch, SplitT> createPartitionedFormat(
        Configuration hadoopConfig,
        RowType tableType,
        List<String> partitionKeys,
        PartitionFieldExtractor<SplitT> extractor,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Function<RowType, TypeInformation<RowData>> rowTypeInfoFactory) {
    // TODO FLINK-25113 all this partition keys code should be pruned from the orc format,
    // because now FileSystemTableSource uses FileInfoExtractorBulkFormat for reading partition
    // keys.
    String[] tableFieldNames = tableType.getFieldNames().toArray(new String[0]);
    LogicalType[] tableFieldTypes = tableType.getChildren().toArray(new LogicalType[0]);
    List<String> orcFieldNames = getNonPartNames(tableFieldNames, partitionKeys);
    int[] orcSelectedFields = getSelectedOrcFields(tableFieldNames, selectedFields, orcFieldNames);
    ColumnBatchFactory<VectorizedRowBatch, SplitT> batchGenerator = (SplitT split, VectorizedRowBatch rowBatch) -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = tableFieldNames[selectedFields[i]];
            LogicalType type = tableFieldTypes[selectedFields[i]];
            vectors[i] =
                    partitionKeys.contains(name)
                            ? createFlinkVectorFromConstant(
                                    type, extractor.extract(split, name, type), batchSize)
                            : createFlinkVector(rowBatch.cols[orcFieldNames.indexOf(name)]);
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new OrcColumnarRowInputFormat<>(
            new OrcNoHiveShim(),
            hadoopConfig,
            convertToOrcTypeWithPart(tableFieldNames, tableFieldTypes, partitionKeys),
            orcSelectedFields,
            conjunctPredicates,
            batchSize,
            batchGenerator,
            rowTypeInfoFactory.apply(
                    new RowType(
                            Arrays.stream(selectedFields)
                                    .mapToObj(i -> tableType.getFields().get(i))
                                    .collect(Collectors.toList()))));
}
Also used: LogicalType (org.apache.flink.table.types.logical.LogicalType), RowType (org.apache.flink.table.types.logical.RowType), VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch), VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch), OrcColumnarRowInputFormat (org.apache.flink.orc.OrcColumnarRowInputFormat), OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim)
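
The format returned here is a BulkFormat over RowData, so it can be handed straight to a FileSource. Below is a minimal wiring sketch, not taken from the project: it assumes a format already built via createPartitionedFormat for plain FileSourceSplit splits, and the table path and source name are placeholders.

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.FileSourceSplit;
import org.apache.flink.core.fs.Path;
import org.apache.flink.orc.OrcColumnarRowInputFormat;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.data.RowData;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

public class OrcFileSourceSketch {

    // Wires an already-built ORC input format into a DataStream job; the table path is a placeholder.
    public static DataStream<RowData> buildStream(
            StreamExecutionEnvironment env,
            OrcColumnarRowInputFormat<VectorizedRowBatch, FileSourceSplit> format) {
        FileSource<RowData> source =
                FileSource.forBulkFileFormat(format, new Path("/data/warehouse/my_table")).build();
        return env.fromSource(source, WatermarkStrategy.noWatermarks(), "orc-source");
    }
}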

Example 2 with VectorizedRowBatch

Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.

From the class OrcNoHiveBulkWriterFactory, method create.

@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
    OrcFile.WriterOptions opts = OrcFile.writerOptions(new Properties(), conf);
    TypeDescription description = TypeDescription.fromString(schema);
    opts.setSchema(description);
    opts.physicalWriter(new NoHivePhysicalWriterImpl(out, opts));
    WriterImpl writer = new WriterImpl(null, new Path("."), opts);
    VectorizedRowBatch rowBatch = description.createRowBatch();
    return new BulkWriter<RowData>() {

        @Override
        public void addElement(RowData row) throws IOException {
            int rowId = rowBatch.size++;
            for (int i = 0; i < row.getArity(); ++i) {
                setColumn(rowId, rowBatch.cols[i], fieldTypes[i], row, i);
            }
            if (rowBatch.size == rowBatch.getMaxSize()) {
                writer.addRowBatch(rowBatch);
                rowBatch.reset();
            }
        }

        @Override
        public void flush() throws IOException {
            if (rowBatch.size != 0) {
                writer.addRowBatch(rowBatch);
                rowBatch.reset();
            }
        }

        @Override
        public void finish() throws IOException {
            flush();
            writer.close();
        }
    };
}
Also used: Path (org.apache.hadoop.fs.Path), VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch), RowData (org.apache.flink.table.data.RowData), NoHivePhysicalWriterImpl (org.apache.flink.orc.nohive.writer.NoHivePhysicalWriterImpl), OrcFile (org.apache.orc.OrcFile), BulkWriter (org.apache.flink.api.common.serialization.BulkWriter), TypeDescription (org.apache.orc.TypeDescription), Properties (java.util.Properties), WriterImpl (org.apache.orc.impl.WriterImpl)
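
The writer returned by create follows the generic BulkWriter contract: addElement buffers a row into the VectorizedRowBatch, a full batch is handed to the ORC writer, flush pushes any partial batch, and finish also closes the ORC writer. Below is a minimal lifecycle sketch, not taken from the project; it works with any BulkWriter.Factory<RowData> (such as the factory above), and the target path is supplied by the caller.

import java.io.IOException;
import java.util.List;

import org.apache.flink.api.common.serialization.BulkWriter;
import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.data.RowData;

public class BulkWriterLifecycleSketch {

    // Writes the given rows with any BulkWriter.Factory<RowData>, e.g. the ORC factory above.
    public static void write(BulkWriter.Factory<RowData> factory, Path target, List<RowData> rows)
            throws IOException {
        FileSystem fs = target.getFileSystem();
        try (FSDataOutputStream out = fs.create(target, FileSystem.WriteMode.OVERWRITE)) {
            BulkWriter<RowData> writer = factory.create(out);
            for (RowData row : rows) {
                // Buffered into the row batch; full batches are handed to the ORC writer.
                writer.addElement(row);
            }
            // Flushes the last partial batch and closes the ORC writer, but not the stream.
            writer.finish();
        }
    }
}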

Example 3 with VectorizedRowBatch

Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.

From the class OrcColumnarRowSplitReaderNoHiveTest, method prepareReadFileWithTypes.

@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
    // NOTE: ORC stores field names, so the names here must match the names in the ORC schema.
    TypeDescription schema =
            TypeDescription.fromString(
                    "struct<f0:float,f1:double,f2:timestamp,f3:tinyint,f4:smallint>");
    org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
    Configuration conf = new Configuration();
    Writer writer = OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch(rowSize);
    DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
    DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
    TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
    LongColumnVector col3 = (LongColumnVector) batch.cols[3];
    LongColumnVector col4 = (LongColumnVector) batch.cols[4];
    col0.noNulls = false;
    col1.noNulls = false;
    col2.noNulls = false;
    col3.noNulls = false;
    col4.noNulls = false;
    for (int i = 0; i < rowSize - 1; i++) {
        col0.vector[i] = i;
        col1.vector[i] = i;
        Timestamp timestamp = toTimestamp(i);
        col2.time[i] = timestamp.getTime();
        col2.nanos[i] = timestamp.getNanos();
        col3.vector[i] = i;
        col4.vector[i] = i;
    }
    col0.isNull[rowSize - 1] = true;
    col1.isNull[rowSize - 1] = true;
    col2.isNull[rowSize - 1] = true;
    col3.isNull[rowSize - 1] = true;
    col4.isNull[rowSize - 1] = true;
    batch.size = rowSize;
    writer.addRowBatch(batch);
    batch.reset();
    writer.close();
}
Also used: TimestampColumnVector (org.apache.orc.storage.ql.exec.vector.TimestampColumnVector), DoubleColumnVector (org.apache.orc.storage.ql.exec.vector.DoubleColumnVector), Configuration (org.apache.hadoop.conf.Configuration), Timestamp (java.sql.Timestamp), VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch), TypeDescription (org.apache.orc.TypeDescription), Writer (org.apache.orc.Writer), LongColumnVector (org.apache.orc.storage.ql.exec.vector.LongColumnVector)
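
Reading such a file back follows the standard ORC pattern: loop on RecordReader.nextBatch, walk batch.size rows per batch, and check noNulls/isNull (plus isRepeating) before dereferencing a vector. Below is a minimal sketch, not taken from the test; it assumes the nohive orc-core artifact, where the storage-api vector classes live under org.apache.orc.storage, and it extracts only column f0 of the schema written above.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.storage.ql.exec.vector.DoubleColumnVector;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

public class OrcBatchReadSketch {

    // Reads column f0 (float, surfaced as a DoubleColumnVector) batch by batch; null entries stay null.
    public static List<Double> readF0(String file) throws IOException {
        List<Double> values = new ArrayList<>();
        Reader reader = OrcFile.createReader(new Path(file), OrcFile.readerOptions(new Configuration()));
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        // nextBatch() refills the batch and returns false once the file is exhausted.
        while (rows.nextBatch(batch)) {
            DoubleColumnVector f0 = (DoubleColumnVector) batch.cols[0];
            for (int r = 0; r < batch.size; r++) {
                int idx = f0.isRepeating ? 0 : r;
                if (!f0.noNulls && f0.isNull[idx]) {
                    values.add(null);
                } else {
                    values.add(f0.vector[idx]);
                }
            }
        }
        rows.close();
        return values;
    }
}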

Example 4 with VectorizedRowBatch

Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project incubator-gobblin by apache.

From the class GenericRecordToOrcValueWriterTest, method deserializeOrcRecords.

public static final List<Writable> deserializeOrcRecords(Path orcFilePath, FileSystem fs) throws IOException {
    org.apache.orc.Reader fileReader = OrcFile.createReader(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));
    RecordReader recordReader = fileReader.rows();
    TypeDescription schema = fileReader.getSchema();
    VectorizedRowBatch batch = schema.createRowBatch();
    recordReader.nextBatch(batch);
    int rowInBatch = 0;
    // result container
    List<Writable> orcRecords = new ArrayList<>();
    long rowCount = fileReader.getNumberOfRows();
    while (rowCount > 0) {
        // Refill the batch once all of its rows have been consumed (for files larger than one batch).
        if (rowInBatch >= batch.size) {
            recordReader.nextBatch(batch);
            rowInBatch = 0;
        }
        // Deserialize records using the MapReduce-like API
        if (schema.getCategory() == TypeDescription.Category.STRUCT) {
            OrcStruct result = (OrcStruct) OrcStruct.createValue(fileReader.getSchema());
            List<TypeDescription> children = schema.getChildren();
            int numberOfChildren = children.size();
            for (int i = 0; i < numberOfChildren; ++i) {
                result.setFieldValue(i, nextValue(batch.cols[i], rowInBatch, children.get(i), result.getFieldValue(i)));
            }
            orcRecords.add(result);
        } else {
            throw new UnsupportedOperationException("The serialized records have to be a struct in the outer-most layer.");
        }
        rowCount -= 1;
        rowInBatch += 1;
    }
    recordReader.close();
    return orcRecords;
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), RecordReader (org.apache.orc.RecordReader), ArrayList (java.util.ArrayList), Writable (org.apache.hadoop.io.Writable), IntWritable (org.apache.hadoop.io.IntWritable), VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch), OrcStruct (org.apache.orc.mapred.OrcStruct), OrcFile (org.apache.orc.OrcFile), TypeDescription (org.apache.orc.TypeDescription)
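
Each Writable returned by this helper is an org.apache.orc.mapred.OrcStruct, whose fields can be read back by index or by name via getFieldValue. Below is a small usage sketch, not taken from the test; it assumes the test class above is on the classpath (its import is omitted), and the ORC file path and printed field are placeholders.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.orc.mapred.OrcStruct;

public class DeserializedRecordInspectionSketch {

    // Prints the first field of every record in the given ORC file (path supplied by the caller).
    public static void printFirstField(Path orcFile) throws IOException {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        List<Writable> records = GenericRecordToOrcValueWriterTest.deserializeOrcRecords(orcFile, fs);
        for (Writable record : records) {
            OrcStruct struct = (OrcStruct) record;
            // Fields can also be fetched by name, e.g. struct.getFieldValue("id").
            System.out.println(struct.getFieldValue(0));
        }
    }
}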

Example 5 with VectorizedRowBatch

Use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project flink by apache.

From the class OrcNoHiveSplitReaderUtil, method genPartColumnarRowReader.

/**
 * Utility for generating a partitioned {@link OrcColumnarRowSplitReader}.
 */
public static OrcColumnarRowSplitReader<VectorizedRowBatch> genPartColumnarRowReader(
        Configuration conf,
        String[] fullFieldNames,
        DataType[] fullFieldTypes,
        Map<String, Object> partitionSpec,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        int batchSize,
        Path path,
        long splitStart,
        long splitLength) throws IOException {
    List<String> nonPartNames = getNonPartNames(fullFieldNames, partitionSpec);
    int[] selectedOrcFields = getSelectedOrcFields(fullFieldNames, selectedFields, nonPartNames);
    OrcColumnarRowSplitReader.ColumnBatchGenerator<VectorizedRowBatch> gen = (VectorizedRowBatch rowBatch) -> {
        // create and initialize the row batch
        ColumnVector[] vectors = new ColumnVector[selectedFields.length];
        for (int i = 0; i < vectors.length; i++) {
            String name = fullFieldNames[selectedFields[i]];
            LogicalType type = fullFieldTypes[selectedFields[i]].getLogicalType();
            vectors[i] =
                    partitionSpec.containsKey(name)
                            ? createFlinkVectorFromConstant(type, partitionSpec.get(name), batchSize)
                            : createFlinkVector(rowBatch.cols[nonPartNames.indexOf(name)]);
        }
        return new VectorizedColumnBatch(vectors);
    };
    return new OrcColumnarRowSplitReader<>(
            new OrcNoHiveShim(),
            conf,
            convertToOrcTypeWithPart(fullFieldNames, fullFieldTypes, partitionSpec.keySet()),
            selectedOrcFields,
            gen,
            conjunctPredicates,
            batchSize,
            path,
            splitStart,
            splitLength);
}
Also used: VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch), VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch), OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim), LogicalType (org.apache.flink.table.types.logical.LogicalType), OrcColumnarRowSplitReader (org.apache.flink.orc.OrcColumnarRowSplitReader)
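
The returned OrcColumnarRowSplitReader is consumed with a reachedEnd()/nextRecord() loop and closed when done. Below is a minimal consumption sketch, not taken from the project; it assumes a reader already obtained from genPartColumnarRowReader and simply counts the rows.

import java.io.IOException;

import org.apache.flink.orc.OrcColumnarRowSplitReader;
import org.apache.flink.table.data.RowData;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

public class SplitReaderConsumptionSketch {

    // Drains a split reader produced by genPartColumnarRowReader and returns the row count.
    public static long countRows(OrcColumnarRowSplitReader<VectorizedRowBatch> reader)
            throws IOException {
        long count = 0;
        try {
            while (!reader.reachedEnd()) {
                RowData row = reader.nextRecord(null);
                // A real pipeline would emit 'row' downstream; here we only count it.
                count++;
            }
        } finally {
            reader.close();
        }
        return count;
    }
}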

Aggregations

VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch): 8
TypeDescription (org.apache.orc.TypeDescription): 6
Configuration (org.apache.hadoop.conf.Configuration): 4
OrcFile (org.apache.orc.OrcFile): 4
Properties (java.util.Properties): 3
Schema (org.apache.avro.Schema): 3
GenericRecord (org.apache.avro.generic.GenericRecord): 3
Path (org.apache.hadoop.fs.Path): 3
IntWritable (org.apache.hadoop.io.IntWritable): 3
Writable (org.apache.hadoop.io.Writable): 3
Writer (org.apache.orc.Writer): 3
Test (org.testng.annotations.Test): 3
File (java.io.File): 2
OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim): 2
VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch): 2
LogicalType (org.apache.flink.table.types.logical.LogicalType): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
Timestamp (java.sql.Timestamp): 1
ArrayList (java.util.ArrayList): 1
BulkWriter (org.apache.flink.api.common.serialization.BulkWriter): 1