Search in sources :

Example 1 with VarCharType

use of org.apache.flink.table.types.logical.VarCharType in project flink by apache.

the class OrcSplitReaderUtil method logicalTypeToOrcType.

/**
 * See {@code org.apache.flink.table.catalog.hive.util.HiveTypeUtil}.
 */
public static TypeDescription logicalTypeToOrcType(LogicalType type) {
    type = type.copy(true);
    switch(type.getTypeRoot()) {
        case CHAR:
            return TypeDescription.createChar().withMaxLength(((CharType) type).getLength());
        case VARCHAR:
            int len = ((VarCharType) type).getLength();
            if (len == VarCharType.MAX_LENGTH) {
                return TypeDescription.createString();
            } else {
                return TypeDescription.createVarchar().withMaxLength(len);
            }
        case BOOLEAN:
            return TypeDescription.createBoolean();
        case VARBINARY:
            if (type.equals(DataTypes.BYTES().getLogicalType())) {
                return TypeDescription.createBinary();
            } else {
                throw new UnsupportedOperationException("Not support other binary type: " + type);
            }
        case DECIMAL:
            DecimalType decimalType = (DecimalType) type;
            return TypeDescription.createDecimal().withScale(decimalType.getScale()).withPrecision(decimalType.getPrecision());
        case TINYINT:
            return TypeDescription.createByte();
        case SMALLINT:
            return TypeDescription.createShort();
        case INTEGER:
            return TypeDescription.createInt();
        case BIGINT:
            return TypeDescription.createLong();
        case FLOAT:
            return TypeDescription.createFloat();
        case DOUBLE:
            return TypeDescription.createDouble();
        case DATE:
            return TypeDescription.createDate();
        case TIMESTAMP_WITHOUT_TIME_ZONE:
            return TypeDescription.createTimestamp();
        case ARRAY:
            ArrayType arrayType = (ArrayType) type;
            return TypeDescription.createList(logicalTypeToOrcType(arrayType.getElementType()));
        case MAP:
            MapType mapType = (MapType) type;
            return TypeDescription.createMap(logicalTypeToOrcType(mapType.getKeyType()), logicalTypeToOrcType(mapType.getValueType()));
        case ROW:
            RowType rowType = (RowType) type;
            TypeDescription struct = TypeDescription.createStruct();
            for (int i = 0; i < rowType.getFieldCount(); i++) {
                struct.addField(rowType.getFieldNames().get(i), logicalTypeToOrcType(rowType.getChildren().get(i)));
            }
            return struct;
        default:
            throw new UnsupportedOperationException("Unsupported type: " + type);
    }
}
Also used : ArrayType(org.apache.flink.table.types.logical.ArrayType) DecimalType(org.apache.flink.table.types.logical.DecimalType) RowType(org.apache.flink.table.types.logical.RowType) TypeDescription(org.apache.orc.TypeDescription) VarCharType(org.apache.flink.table.types.logical.VarCharType) MapType(org.apache.flink.table.types.logical.MapType)

Example 2 with VarCharType

use of org.apache.flink.table.types.logical.VarCharType in project flink by apache.

the class ParquetColumnarRowSplitReaderTest method innerTestPartitionValues.

private void innerTestPartitionValues(Path testPath, Map<String, Object> partSpec, boolean nullPartValue) throws IOException {
    LogicalType[] fieldTypes = new LogicalType[] { new VarCharType(VarCharType.MAX_LENGTH), new BooleanType(), new TinyIntType(), new SmallIntType(), new IntType(), new BigIntType(), new FloatType(), new DoubleType(), new TimestampType(9), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new BooleanType(), new DateType(), new TimestampType(9), new DoubleType(), new TinyIntType(), new SmallIntType(), new IntType(), new BigIntType(), new FloatType(), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new VarCharType(VarCharType.MAX_LENGTH) };
    ParquetColumnarRowSplitReader reader = ParquetSplitReaderUtil.genPartColumnarRowReader(false, true, new Configuration(), IntStream.range(0, 28).mapToObj(i -> "f" + i).toArray(String[]::new), Arrays.stream(fieldTypes).map(TypeConversions::fromLogicalToDataType).toArray(DataType[]::new), partSpec, new int[] { 7, 2, 4, 15, 19, 20, 21, 22, 23, 18, 16, 17, 24, 25, 26, 27 }, rowGroupSize, new Path(testPath.getPath()), 0, Long.MAX_VALUE);
    int i = 0;
    while (!reader.reachedEnd()) {
        ColumnarRowData row = reader.nextRecord();
        // common values
        assertEquals(i, row.getDouble(0), 0);
        assertEquals((byte) i, row.getByte(1));
        assertEquals(i, row.getInt(2));
        // partition values
        if (nullPartValue) {
            for (int j = 3; j < 16; j++) {
                assertTrue(row.isNullAt(j));
            }
        } else {
            assertTrue(row.getBoolean(3));
            assertEquals(9, row.getByte(4));
            assertEquals(10, row.getShort(5));
            assertEquals(11, row.getInt(6));
            assertEquals(12, row.getLong(7));
            assertEquals(13, row.getFloat(8), 0);
            assertEquals(6.6, row.getDouble(9), 0);
            assertEquals(DateTimeUtils.toInternal(Date.valueOf("2020-11-23")), row.getInt(10));
            assertEquals(LocalDateTime.of(1999, 1, 1, 1, 1), row.getTimestamp(11, 9).toLocalDateTime());
            assertEquals(DecimalData.fromBigDecimal(new BigDecimal(24), 5, 0), row.getDecimal(12, 5, 0));
            assertEquals(DecimalData.fromBigDecimal(new BigDecimal(25), 15, 0), row.getDecimal(13, 15, 0));
            assertEquals(DecimalData.fromBigDecimal(new BigDecimal(26), 20, 0), row.getDecimal(14, 20, 0));
            assertEquals("f27", row.getString(15).toString());
        }
        i++;
    }
    reader.close();
}
Also used : Path(org.apache.flink.core.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) TypeConversions(org.apache.flink.table.types.utils.TypeConversions) BooleanType(org.apache.flink.table.types.logical.BooleanType) LogicalType(org.apache.flink.table.types.logical.LogicalType) BigIntType(org.apache.flink.table.types.logical.BigIntType) BigDecimal(java.math.BigDecimal) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) IntType(org.apache.flink.table.types.logical.IntType) BigIntType(org.apache.flink.table.types.logical.BigIntType) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) FloatType(org.apache.flink.table.types.logical.FloatType) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) DoubleType(org.apache.flink.table.types.logical.DoubleType) TimestampType(org.apache.flink.table.types.logical.TimestampType) DecimalType(org.apache.flink.table.types.logical.DecimalType) DataType(org.apache.flink.table.types.DataType) ColumnarRowData(org.apache.flink.table.data.columnar.ColumnarRowData) VarCharType(org.apache.flink.table.types.logical.VarCharType) DateType(org.apache.flink.table.types.logical.DateType)

Example 3 with VarCharType

use of org.apache.flink.table.types.logical.VarCharType in project flink by apache.

the class ParquetColumnarRowInputFormatTest method innerTestPartitionValues.

private void innerTestPartitionValues(Path testPath, List<String> partitionKeys, boolean nullPartValue) throws IOException {
    LogicalType[] fieldTypes = new LogicalType[] { new VarCharType(VarCharType.MAX_LENGTH), new BooleanType(), new TinyIntType(), new SmallIntType(), new IntType(), new BigIntType(), new FloatType(), new DoubleType(), new TimestampType(9), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new BooleanType(), new DateType(), new TimestampType(9), new DoubleType(), new TinyIntType(), new SmallIntType(), new IntType(), new BigIntType(), new FloatType(), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new VarCharType(VarCharType.MAX_LENGTH) };
    RowType rowType = RowType.of(fieldTypes, IntStream.range(0, 28).mapToObj(i -> "f" + i).toArray(String[]::new));
    int[] projected = new int[] { 7, 2, 4, 15, 19, 20, 21, 22, 23, 18, 16, 17, 24, 25, 26, 27 };
    RowType producedType = new RowType(Arrays.stream(projected).mapToObj(i -> rowType.getFields().get(i)).collect(Collectors.toList()));
    ParquetColumnarRowInputFormat<FileSourceSplit> format = ParquetColumnarRowInputFormat.createPartitionedFormat(new Configuration(), producedType, InternalTypeInfo.of(producedType), partitionKeys, PartitionFieldExtractor.forFileSystem("my_default_value"), 500, false, true);
    FileStatus fileStatus = testPath.getFileSystem().getFileStatus(testPath);
    AtomicInteger cnt = new AtomicInteger(0);
    forEachRemaining(format.createReader(EMPTY_CONF, new FileSourceSplit("id", testPath, 0, Long.MAX_VALUE, fileStatus.getModificationTime(), fileStatus.getLen())), row -> {
        int i = cnt.get();
        // common values
        assertEquals(i, row.getDouble(0), 0);
        assertEquals((byte) i, row.getByte(1));
        assertEquals(i, row.getInt(2));
        // partition values
        if (nullPartValue) {
            for (int j = 3; j < 16; j++) {
                assertTrue(row.isNullAt(j));
            }
        } else {
            assertTrue(row.getBoolean(3));
            assertEquals(9, row.getByte(4));
            assertEquals(10, row.getShort(5));
            assertEquals(11, row.getInt(6));
            assertEquals(12, row.getLong(7));
            assertEquals(13, row.getFloat(8), 0);
            assertEquals(6.6, row.getDouble(9), 0);
            assertEquals(DateTimeUtils.toInternal(Date.valueOf("2020-11-23")), row.getInt(10));
            assertEquals(LocalDateTime.of(1999, 1, 1, 1, 1), row.getTimestamp(11, 9).toLocalDateTime());
            assertEquals(DecimalData.fromBigDecimal(new BigDecimal(24), 5, 0), row.getDecimal(12, 5, 0));
            assertEquals(DecimalData.fromBigDecimal(new BigDecimal(25), 15, 0), row.getDecimal(13, 15, 0));
            assertEquals(DecimalData.fromBigDecimal(new BigDecimal(26), 20, 0), row.getDecimal(14, 20, 0));
            assertEquals("f27", row.getString(15).toString());
        }
        cnt.incrementAndGet();
    });
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) Configuration(org.apache.hadoop.conf.Configuration) BooleanType(org.apache.flink.table.types.logical.BooleanType) LogicalType(org.apache.flink.table.types.logical.LogicalType) BigIntType(org.apache.flink.table.types.logical.BigIntType) RowType(org.apache.flink.table.types.logical.RowType) BigDecimal(java.math.BigDecimal) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) IntType(org.apache.flink.table.types.logical.IntType) BigIntType(org.apache.flink.table.types.logical.BigIntType) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) FloatType(org.apache.flink.table.types.logical.FloatType) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DoubleType(org.apache.flink.table.types.logical.DoubleType) TimestampType(org.apache.flink.table.types.logical.TimestampType) DecimalType(org.apache.flink.table.types.logical.DecimalType) VarCharType(org.apache.flink.table.types.logical.VarCharType) DateType(org.apache.flink.table.types.logical.DateType)

Example 4 with VarCharType

use of org.apache.flink.table.types.logical.VarCharType in project flink by apache.

the class ParquetColumnarRowInputFormatTest method testReadingSplit.

private int testReadingSplit(List<Integer> expected, Path path, long splitStart, long splitLength, long seekToRow) throws IOException {
    LogicalType[] fieldTypes = new LogicalType[] { new VarCharType(VarCharType.MAX_LENGTH), new BooleanType(), new TinyIntType(), new SmallIntType(), new IntType(), new BigIntType(), new FloatType(), new DoubleType(), new TimestampType(9), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0), new DecimalType(5, 0), new DecimalType(15, 0), new DecimalType(20, 0) };
    ParquetColumnarRowInputFormat format = new ParquetColumnarRowInputFormat(new Configuration(), RowType.of(fieldTypes, new String[] { "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12", "f13", "f14" }), null, 500, false, true);
    // validate java serialization
    try {
        InstantiationUtil.clone(format);
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
    FileStatus fileStatus = path.getFileSystem().getFileStatus(path);
    BulkFormat.Reader<RowData> reader = format.restoreReader(EMPTY_CONF, new FileSourceSplit("id", path, splitStart, splitLength, fileStatus.getModificationTime(), fileStatus.getLen(), new String[0], new CheckpointedPosition(CheckpointedPosition.NO_OFFSET, seekToRow)));
    AtomicInteger cnt = new AtomicInteger(0);
    final AtomicReference<RowData> previousRow = new AtomicReference<>();
    forEachRemaining(reader, row -> {
        if (previousRow.get() == null) {
            previousRow.set(row);
        } else {
            // ParquetColumnarRowInputFormat should only have one row instance.
            assertSame(previousRow.get(), row);
        }
        Integer v = expected.get(cnt.get());
        if (v == null) {
            assertTrue(row.isNullAt(0));
            assertTrue(row.isNullAt(1));
            assertTrue(row.isNullAt(2));
            assertTrue(row.isNullAt(3));
            assertTrue(row.isNullAt(4));
            assertTrue(row.isNullAt(5));
            assertTrue(row.isNullAt(6));
            assertTrue(row.isNullAt(7));
            assertTrue(row.isNullAt(8));
            assertTrue(row.isNullAt(9));
            assertTrue(row.isNullAt(10));
            assertTrue(row.isNullAt(11));
            assertTrue(row.isNullAt(12));
            assertTrue(row.isNullAt(13));
            assertTrue(row.isNullAt(14));
        } else {
            assertEquals("" + v, row.getString(0).toString());
            assertEquals(v % 2 == 0, row.getBoolean(1));
            assertEquals(v.byteValue(), row.getByte(2));
            assertEquals(v.shortValue(), row.getShort(3));
            assertEquals(v.intValue(), row.getInt(4));
            assertEquals(v.longValue(), row.getLong(5));
            assertEquals(v.floatValue(), row.getFloat(6), 0);
            assertEquals(v.doubleValue(), row.getDouble(7), 0);
            assertEquals(toDateTime(v), row.getTimestamp(8, 9).toLocalDateTime());
            assertEquals(BigDecimal.valueOf(v), row.getDecimal(9, 5, 0).toBigDecimal());
            assertEquals(BigDecimal.valueOf(v), row.getDecimal(10, 15, 0).toBigDecimal());
            assertEquals(BigDecimal.valueOf(v), row.getDecimal(11, 20, 0).toBigDecimal());
            assertEquals(BigDecimal.valueOf(v), row.getDecimal(12, 5, 0).toBigDecimal());
            assertEquals(BigDecimal.valueOf(v), row.getDecimal(13, 15, 0).toBigDecimal());
            assertEquals(BigDecimal.valueOf(v), row.getDecimal(14, 20, 0).toBigDecimal());
        }
        cnt.incrementAndGet();
    });
    return cnt.get();
}
Also used : FileStatus(org.apache.flink.core.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) LogicalType(org.apache.flink.table.types.logical.LogicalType) BigIntType(org.apache.flink.table.types.logical.BigIntType) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) IntType(org.apache.flink.table.types.logical.IntType) BigIntType(org.apache.flink.table.types.logical.BigIntType) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) FloatType(org.apache.flink.table.types.logical.FloatType) RowData(org.apache.flink.table.data.RowData) CheckpointedPosition(org.apache.flink.connector.file.src.util.CheckpointedPosition) TimestampType(org.apache.flink.table.types.logical.TimestampType) VarCharType(org.apache.flink.table.types.logical.VarCharType) BooleanType(org.apache.flink.table.types.logical.BooleanType) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DoubleType(org.apache.flink.table.types.logical.DoubleType) DecimalType(org.apache.flink.table.types.logical.DecimalType) BulkFormat(org.apache.flink.connector.file.src.reader.BulkFormat)

Example 5 with VarCharType

use of org.apache.flink.table.types.logical.VarCharType in project flink by apache.

the class ParquetColumnarRowInputFormatTest method testProjectionReadUnknownField.

@Test
public void testProjectionReadUnknownField() throws IOException {
    int number = 1000;
    List<Row> records = new ArrayList<>(number);
    for (int i = 0; i < number; i++) {
        Integer v = i;
        records.add(newRow(v));
    }
    Path testPath = createTempParquetFile(TEMPORARY_FOLDER.newFolder(), PARQUET_SCHEMA, records, rowGroupSize);
    // test reader
    LogicalType[] fieldTypes = new LogicalType[] { new DoubleType(), new TinyIntType(), new IntType(), new VarCharType() };
    ParquetColumnarRowInputFormat<FileSourceSplit> format = new ParquetColumnarRowInputFormat<>(new Configuration(), // f99 not exist in parquet file.
    RowType.of(fieldTypes, new String[] { "f7", "f2", "f4", "f99" }), null, 500, false, true);
    AtomicInteger cnt = new AtomicInteger(0);
    forEachRemaining(format.createReader(EMPTY_CONF, new FileSourceSplit("id", testPath, 0, Long.MAX_VALUE, 0, Long.MAX_VALUE)), row -> {
        int i = cnt.get();
        assertEquals(i, row.getDouble(0), 0);
        assertEquals((byte) i, row.getByte(1));
        assertEquals(i, row.getInt(2));
        assertTrue(row.isNullAt(3));
        cnt.incrementAndGet();
    });
}
Also used : Path(org.apache.flink.core.fs.Path) PartitionPathUtils.generatePartitionPath(org.apache.flink.table.utils.PartitionPathUtils.generatePartitionPath) FileSourceSplit(org.apache.flink.connector.file.src.FileSourceSplit) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) LogicalType(org.apache.flink.table.types.logical.LogicalType) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) TinyIntType(org.apache.flink.table.types.logical.TinyIntType) IntType(org.apache.flink.table.types.logical.IntType) BigIntType(org.apache.flink.table.types.logical.BigIntType) SmallIntType(org.apache.flink.table.types.logical.SmallIntType) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DoubleType(org.apache.flink.table.types.logical.DoubleType) Row(org.apache.flink.types.Row) VarCharType(org.apache.flink.table.types.logical.VarCharType) Test(org.junit.Test)

Aggregations

VarCharType (org.apache.flink.table.types.logical.VarCharType)20 RowType (org.apache.flink.table.types.logical.RowType)11 BigIntType (org.apache.flink.table.types.logical.BigIntType)10 IntType (org.apache.flink.table.types.logical.IntType)10 LogicalType (org.apache.flink.table.types.logical.LogicalType)10 TimestampType (org.apache.flink.table.types.logical.TimestampType)10 ArrayList (java.util.ArrayList)7 ArrayType (org.apache.flink.table.types.logical.ArrayType)7 DecimalType (org.apache.flink.table.types.logical.DecimalType)7 DoubleType (org.apache.flink.table.types.logical.DoubleType)7 SmallIntType (org.apache.flink.table.types.logical.SmallIntType)7 TinyIntType (org.apache.flink.table.types.logical.TinyIntType)7 BooleanType (org.apache.flink.table.types.logical.BooleanType)6 FloatType (org.apache.flink.table.types.logical.FloatType)6 DateType (org.apache.flink.table.types.logical.DateType)5 MapType (org.apache.flink.table.types.logical.MapType)5 GenericRowData (org.apache.flink.table.data.GenericRowData)4 LocalZonedTimestampType (org.apache.flink.table.types.logical.LocalZonedTimestampType)4 VarBinaryType (org.apache.flink.table.types.logical.VarBinaryType)4 Configuration (org.apache.hadoop.conf.Configuration)4