Search in sources :

Example 1 with HeapBytesVector

Use of org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector in project flink by apache.

Class ParquetSplitReaderUtil, method createVectorFromConstant.

/**
 * Creates a column vector, constructed with {@code batchSize}, whose content is the single
 * constant {@code value}.
 *
 * <p>A {@code null} value yields a vector on which {@code fillWithNulls()} was called; any other
 * value is converted to the representation required by the type root and passed to
 * {@code fill(...)}. Decimals and dates delegate to the integer / long / binary branches of this
 * same method.
 *
 * @param type logical type that selects the concrete vector implementation
 * @param value constant to store, or {@code null}
 * @param batchSize size argument handed to the vector constructor
 * @return the populated vector
 * @throws UnsupportedOperationException if the type root is not handled by any branch
 */
public static ColumnVector createVectorFromConstant(LogicalType type, Object value, int batchSize) {
    final boolean isNull = value == null;
    switch (type.getTypeRoot()) {
        case CHAR:
        case VARCHAR:
        case BINARY:
        case VARBINARY: {
            HeapBytesVector bytesVector = new HeapBytesVector(batchSize);
            if (isNull) {
                bytesVector.fillWithNulls();
                return bytesVector;
            }
            // Raw byte arrays are used as-is; everything else goes through its string form.
            byte[] payload;
            if (value instanceof byte[]) {
                payload = (byte[]) value;
            } else {
                payload = value.toString().getBytes(StandardCharsets.UTF_8);
            }
            bytesVector.fill(payload);
            return bytesVector;
        }
        case BOOLEAN: {
            HeapBooleanVector booleanVector = new HeapBooleanVector(batchSize);
            if (isNull) {
                booleanVector.fillWithNulls();
            } else {
                booleanVector.fill((boolean) value);
            }
            return booleanVector;
        }
        case TINYINT: {
            HeapByteVector tinyIntVector = new HeapByteVector(batchSize);
            if (isNull) {
                tinyIntVector.fillWithNulls();
            } else {
                Number number = (Number) value;
                tinyIntVector.fill(number.byteValue());
            }
            return tinyIntVector;
        }
        case SMALLINT: {
            HeapShortVector smallIntVector = new HeapShortVector(batchSize);
            if (isNull) {
                smallIntVector.fillWithNulls();
            } else {
                Number number = (Number) value;
                smallIntVector.fill(number.shortValue());
            }
            return smallIntVector;
        }
        case INTEGER: {
            HeapIntVector intVector = new HeapIntVector(batchSize);
            if (isNull) {
                intVector.fillWithNulls();
            } else {
                Number number = (Number) value;
                intVector.fill(number.intValue());
            }
            return intVector;
        }
        case BIGINT: {
            HeapLongVector longVector = new HeapLongVector(batchSize);
            if (isNull) {
                longVector.fillWithNulls();
            } else {
                Number number = (Number) value;
                longVector.fill(number.longValue());
            }
            return longVector;
        }
        case DECIMAL: {
            DecimalType decimalType = (DecimalType) type;
            int precision = decimalType.getPrecision();
            int scale = decimalType.getScale();
            DecimalData decimal = null;
            if (!isNull) {
                decimal = Preconditions.checkNotNull(DecimalData.fromBigDecimal((BigDecimal) value, precision, scale));
            }
            // The precision decides whether the unscaled value is carried as int, long or bytes.
            LogicalType physicalType;
            Object unscaled = null;
            if (ParquetSchemaConverter.is32BitDecimal(precision)) {
                physicalType = new IntType();
                if (decimal != null) {
                    unscaled = (int) decimal.toUnscaledLong();
                }
            } else if (ParquetSchemaConverter.is64BitDecimal(precision)) {
                physicalType = new BigIntType();
                if (decimal != null) {
                    unscaled = decimal.toUnscaledLong();
                }
            } else {
                physicalType = new VarBinaryType();
                if (decimal != null) {
                    unscaled = decimal.toUnscaledBytes();
                }
            }
            return new ParquetDecimalVector(createVectorFromConstant(physicalType, unscaled, batchSize));
        }
        case FLOAT: {
            HeapFloatVector floatVector = new HeapFloatVector(batchSize);
            if (isNull) {
                floatVector.fillWithNulls();
            } else {
                Number number = (Number) value;
                floatVector.fill(number.floatValue());
            }
            return floatVector;
        }
        case DOUBLE: {
            HeapDoubleVector doubleVector = new HeapDoubleVector(batchSize);
            if (isNull) {
                doubleVector.fillWithNulls();
            } else {
                Number number = (Number) value;
                doubleVector.fill(number.doubleValue());
            }
            return doubleVector;
        }
        case DATE: {
            // A LocalDate is first turned into a Date so that both inputs share one conversion.
            Object dateValue = value;
            if (dateValue instanceof LocalDate) {
                dateValue = Date.valueOf((LocalDate) dateValue);
            }
            return createVectorFromConstant(new IntType(), dateValue == null ? null : toInternal((Date) dateValue), batchSize);
        }
        case TIMESTAMP_WITHOUT_TIME_ZONE: {
            HeapTimestampVector timestampVector = new HeapTimestampVector(batchSize);
            if (isNull) {
                timestampVector.fillWithNulls();
            } else {
                LocalDateTime dateTime = (LocalDateTime) value;
                timestampVector.fill(TimestampData.fromLocalDateTime(dateTime));
            }
            return timestampVector;
        }
        default:
            throw new UnsupportedOperationException("Unsupported type: " + type);
    }
}
Also used : HeapShortVector(org.apache.flink.table.data.columnar.vector.heap.HeapShortVector) HeapLongVector(org.apache.flink.table.data.columnar.vector.heap.HeapLongVector) LocalDateTime(java.time.LocalDateTime) VarBinaryType(org.apache.flink.table.types.logical.VarBinaryType) HeapByteVector(org.apache.flink.table.data.columnar.vector.heap.HeapByteVector) HeapDoubleVector(org.apache.flink.table.data.columnar.vector.heap.HeapDoubleVector) HeapTimestampVector(org.apache.flink.table.data.columnar.vector.heap.HeapTimestampVector) HeapBytesVector(org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector) HeapIntVector(org.apache.flink.table.data.columnar.vector.heap.HeapIntVector) BigIntType(org.apache.flink.table.types.logical.BigIntType) LocalDate(java.time.LocalDate) HeapBooleanVector(org.apache.flink.table.data.columnar.vector.heap.HeapBooleanVector) ColumnVector(org.apache.flink.table.data.columnar.vector.ColumnVector) WritableColumnVector(org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector) IntType(org.apache.flink.table.types.logical.IntType) BigIntType(org.apache.flink.table.types.logical.BigIntType) DecimalData(org.apache.flink.table.data.DecimalData) DecimalType(org.apache.flink.table.types.logical.DecimalType) HeapFloatVector(org.apache.flink.table.data.columnar.vector.heap.HeapFloatVector)

Example 2 with HeapBytesVector

Use of org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector in project flink by apache.

Class ColumnVectorTest, method testBytes.

/**
 * Exercises {@code HeapBytesVector}: appending per-row bytes, appending again after
 * {@code reset()}, filling with a single constant, and reading through a dictionary.
 */
@Test
public void testBytes() {
    final HeapBytesVector bytesVector = new HeapBytesVector(SIZE);

    // Append one payload per row and read every row back.
    for (int row = 0; row < SIZE; row++) {
        byte[] payload = produceBytes(row);
        bytesVector.appendBytes(row, payload, 0, payload.length);
    }
    for (int row = 0; row < SIZE; row++) {
        byte[] expected = produceBytes(row);
        byte[] actual = bytesVector.getBytes(row).getBytes();
        assertArrayEquals(expected, actual);
    }

    // The same round trip must work again once the vector has been reset.
    bytesVector.reset();
    for (int row = 0; row < SIZE; row++) {
        byte[] payload = produceBytes(row);
        bytesVector.appendBytes(row, payload, 0, payload.length);
    }
    for (int row = 0; row < SIZE; row++) {
        byte[] expected = produceBytes(row);
        byte[] actual = bytesVector.getBytes(row).getBytes();
        assertArrayEquals(expected, actual);
    }

    // After fill(...), every row is expected to return the same constant.
    bytesVector.fill(produceBytes(22));
    for (int row = 0; row < SIZE; row++) {
        byte[] expected = produceBytes(22);
        byte[] actual = bytesVector.getBytes(row).getBytes();
        assertArrayEquals(expected, actual);
    }

    // Install a dictionary holding one entry per row, then read through the dictionary ids.
    Object[] dictionaryValues = IntStream.range(0, SIZE).mapToObj(id -> produceBytes(id)).toArray();
    bytesVector.setDictionary(new TestDictionary(dictionaryValues));
    setRangeDictIds(bytesVector);
    for (int row = 0; row < SIZE; row++) {
        byte[] expected = produceBytes(row);
        byte[] actual = bytesVector.getBytes(row).getBytes();
        assertArrayEquals(expected, actual);
    }
}
Also used : HeapBytesVector(org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector) Test(org.junit.Test)

Example 3 with HeapBytesVector

Use of org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector in project flink by apache.

Class VectorizedColumnBatchTest, method testTyped.

/**
 * Builds a {@code VectorizedColumnBatch} with one column per supported accessor (boolean, bytes,
 * byte, double, float, int, long, short, three timestamp encodings, decimal and array) and
 * checks that {@code ColumnarRowData} returns the value written for every row.
 *
 * @throws IOException declared because {@code OutputStream.write(byte[])} is used to assemble
 *     the INT96 timestamp buffer
 */
@Test
public void testTyped() throws IOException {
    // Columns 0-7: primitive heap vectors populated with values derived from the row index.
    HeapBooleanVector col0 = new HeapBooleanVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col0.vector[i] = i % 2 == 0;
    }
    HeapBytesVector col1 = new HeapBytesVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        byte[] bytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8);
        col1.appendBytes(i, bytes, 0, bytes.length);
    }
    HeapByteVector col2 = new HeapByteVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col2.vector[i] = (byte) i;
    }
    HeapDoubleVector col3 = new HeapDoubleVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col3.vector[i] = i;
    }
    HeapFloatVector col4 = new HeapFloatVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col4.vector[i] = i;
    }
    HeapIntVector col5 = new HeapIntVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col5.vector[i] = i;
    }
    HeapLongVector col6 = new HeapLongVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col6.vector[i] = i;
    }
    HeapShortVector col7 = new HeapShortVector(VECTOR_SIZE);
    for (int i = 0; i < VECTOR_SIZE; i++) {
        col7.vector[i] = (short) i;
    }
    // The precision of Timestamp in parquet should be one of MILLIS, MICROS or NANOS.
    // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#timestamp
    //
    // For MILLIS, the underlying INT64 holds milliseconds
    // For MICROS, the underlying INT64 holds microseconds
    // For NANOS, the underlying INT96 holds nanoOfDay(8 bytes) and julianDay(4 bytes)

    // Column 8: MILLIS encoding, row i holds i milliseconds since the epoch.
    long[] vector8 = new long[VECTOR_SIZE];
    for (int i = 0; i < VECTOR_SIZE; i++) {
        vector8[i] = i;
    }
    TimestampColumnVector col8 = new TimestampColumnVector() {

        @Override
        public boolean isNullAt(int i) {
            return false;
        }

        @Override
        public TimestampData getTimestamp(int i, int precision) {
            return TimestampData.fromEpochMillis(vector8[i]);
        }
    };
    // Column 9: MICROS encoding, row i holds i * 1000 microseconds (i.e. i milliseconds).
    long[] vector9 = new long[VECTOR_SIZE];
    for (int i = 0; i < VECTOR_SIZE; i++) {
        vector9[i] = i * 1000;
    }
    TimestampColumnVector col9 = new TimestampColumnVector() {

        @Override
        public TimestampData getTimestamp(int i, int precision) {
            long microseconds = vector9[i];
            // Whole milliseconds plus the sub-millisecond remainder converted to nanoseconds.
            return TimestampData.fromEpochMillis(microseconds / 1000, (int) (microseconds % 1000) * 1000);
        }

        @Override
        public boolean isNullAt(int i) {
            return false;
        }
    };
    // Column 10: NANOS encoding, 12 bytes per row written big-endian:
    // bytes 0-7 hold nanoOfDay, bytes 8-11 hold the Julian day number.
    HeapBytesVector vector10 = new HeapBytesVector(VECTOR_SIZE);
    {
        int nanosecond = 123456789;
        int start = 0;
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        for (int i = 0; i < VECTOR_SIZE; i++) {
            byte[] bytes = new byte[12];
            // i means second
            long l = i * 1000000000L + nanosecond;
            // Emit the low byte last so the 8-byte value ends up most-significant-byte first.
            for (int j = 0; j < 8; j++) {
                bytes[7 - j] = (byte) l;
                l >>>= 8;
            }
            // Epoch Julian
            int n = 2440588;
            for (int j = 0; j < 4; j++) {
                bytes[11 - j] = (byte) n;
                n >>>= 8;
            }
            // The vector's offset/length arrays are set directly; the shared buffer is attached below.
            vector10.start[i] = start;
            vector10.length[i] = 12;
            start += 12;
            out.write(bytes);
        }
        vector10.buffer = out.toByteArray();
    }
    TimestampColumnVector col10 = new TimestampColumnVector() {

        @Override
        public TimestampData getTimestamp(int colId, int precision) {
            byte[] bytes = vector10.getBytes(colId).getBytes();
            assert bytes.length == 12;
            // Decode the big-endian nanoOfDay (8 bytes) followed by the Julian day (4 bytes).
            long nanoOfDay = 0;
            for (int i = 0; i < 8; i++) {
                nanoOfDay <<= 8;
                nanoOfDay |= (bytes[i] & (0xff));
            }
            int julianDay = 0;
            for (int i = 8; i < 12; i++) {
                julianDay <<= 8;
                julianDay |= (bytes[i] & (0xff));
            }
            long millisecond = (julianDay - DateTimeUtils.EPOCH_JULIAN) * DateTimeUtils.MILLIS_PER_DAY + nanoOfDay / 1000000;
            int nanoOfMillisecond = (int) (nanoOfDay % 1000000);
            return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond);
        }

        @Override
        public boolean isNullAt(int i) {
            return false;
        }
    };
    // Column 11: decimals backed by an unscaled long per row.
    long[] vector11 = new long[VECTOR_SIZE];
    DecimalColumnVector col11 = new DecimalColumnVector() {

        @Override
        public boolean isNullAt(int i) {
            return false;
        }

        @Override
        public DecimalData getDecimal(int i, int precision, int scale) {
            return DecimalData.fromUnscaledLong(vector11[i], precision, scale);
        }
    };
    for (int i = 0; i < VECTOR_SIZE; i++) {
        vector11[i] = i;
    }
    // Column 12: each row exposes ARRAY_SIZE consecutive ints from one flat child vector.
    HeapIntVector col12Data = new HeapIntVector(VECTOR_SIZE * ARRAY_SIZE);
    for (int i = 0; i < VECTOR_SIZE * ARRAY_SIZE; i++) {
        col12Data.vector[i] = i;
    }
    ArrayColumnVector col12 = new ArrayColumnVector() {

        @Override
        public boolean isNullAt(int i) {
            return false;
        }

        @Override
        public ArrayData getArray(int i) {
            return new ColumnarArrayData(col12Data, i * ARRAY_SIZE, ARRAY_SIZE);
        }
    };
    VectorizedColumnBatch batch = new VectorizedColumnBatch(new ColumnVector[] { col0, col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12 });
    batch.setNumRows(VECTOR_SIZE);
    // Assertions follow JUnit's (expected, actual) order so failure messages label values correctly.
    for (int i = 0; i < batch.getNumRows(); i++) {
        ColumnarRowData row = new ColumnarRowData(batch, i);
        assertEquals(i % 2 == 0, row.getBoolean(0));
        assertEquals(String.valueOf(i), row.getString(1).toString());
        assertEquals((byte) i, row.getByte(2));
        assertEquals(i, row.getDouble(3), 0);
        assertEquals((float) i, row.getFloat(4), 0);
        assertEquals(i, row.getInt(5));
        assertEquals(i, row.getLong(6));
        assertEquals((short) i, row.getShort(7));
        assertEquals(i, row.getTimestamp(8, 3).getMillisecond());
        assertEquals(i, row.getTimestamp(9, 6).getMillisecond());
        // Row i was written as i seconds plus 123456789 ns: 123 ms and 456789 ns within the millisecond.
        assertEquals(i * 1000L + 123, row.getTimestamp(10, 9).getMillisecond());
        assertEquals(456789, row.getTimestamp(10, 9).getNanoOfMillisecond());
        assertEquals(i, row.getDecimal(11, 10, 0).toUnscaledLong());
        for (int j = 0; j < ARRAY_SIZE; j++) {
            assertEquals(i * ARRAY_SIZE + j, row.getArray(12).getInt(j));
        }
    }
    assertEquals(VECTOR_SIZE, batch.getNumRows());
}
Also used : HeapLongVector(org.apache.flink.table.data.columnar.vector.heap.HeapLongVector) HeapShortVector(org.apache.flink.table.data.columnar.vector.heap.HeapShortVector) HeapByteVector(org.apache.flink.table.data.columnar.vector.heap.HeapByteVector) HeapDoubleVector(org.apache.flink.table.data.columnar.vector.heap.HeapDoubleVector) ColumnarArrayData(org.apache.flink.table.data.columnar.ColumnarArrayData) HeapBytesVector(org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector) HeapIntVector(org.apache.flink.table.data.columnar.vector.heap.HeapIntVector) ByteArrayOutputStream(java.io.ByteArrayOutputStream) HeapBooleanVector(org.apache.flink.table.data.columnar.vector.heap.HeapBooleanVector) ColumnarRowData(org.apache.flink.table.data.columnar.ColumnarRowData) HeapFloatVector(org.apache.flink.table.data.columnar.vector.heap.HeapFloatVector) Test(org.junit.Test)

Aggregations

HeapBytesVector (org.apache.flink.table.data.columnar.vector.heap.HeapBytesVector)3 HeapBooleanVector (org.apache.flink.table.data.columnar.vector.heap.HeapBooleanVector)2 HeapByteVector (org.apache.flink.table.data.columnar.vector.heap.HeapByteVector)2 HeapDoubleVector (org.apache.flink.table.data.columnar.vector.heap.HeapDoubleVector)2 HeapFloatVector (org.apache.flink.table.data.columnar.vector.heap.HeapFloatVector)2 HeapIntVector (org.apache.flink.table.data.columnar.vector.heap.HeapIntVector)2 HeapLongVector (org.apache.flink.table.data.columnar.vector.heap.HeapLongVector)2 HeapShortVector (org.apache.flink.table.data.columnar.vector.heap.HeapShortVector)2 Test (org.junit.Test)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 LocalDate (java.time.LocalDate)1 LocalDateTime (java.time.LocalDateTime)1 DecimalData (org.apache.flink.table.data.DecimalData)1 ColumnarArrayData (org.apache.flink.table.data.columnar.ColumnarArrayData)1 ColumnarRowData (org.apache.flink.table.data.columnar.ColumnarRowData)1 ColumnVector (org.apache.flink.table.data.columnar.vector.ColumnVector)1 HeapTimestampVector (org.apache.flink.table.data.columnar.vector.heap.HeapTimestampVector)1 WritableColumnVector (org.apache.flink.table.data.columnar.vector.writable.WritableColumnVector)1 BigIntType (org.apache.flink.table.types.logical.BigIntType)1 DecimalType (org.apache.flink.table.types.logical.DecimalType)1