Search in sources:

Example 46 with Record

Use of org.apache.iceberg.data.Record in the Apache Hive project.

Class TestHiveIcebergPartitions, method testBucketTransform.

@Test
public void testBucketTransform() throws IOException {
    // Two-column schema; the table is partitioned by a 2-bucket hash of the string column.
    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        optional(2, "part_field", Types.StringType.get()));
    PartitionSpec partitionSpec = PartitionSpec.builderFor(schema).bucket("part_field", 2).build();
    List<Record> expected = TestHelper.RecordsBuilder.newInstance(schema)
        .add(1L, "Part1")
        .add(2L, "Part2")
        .add(3L, "Art3")
        .build();
    Table table = testTables.createTable(shell, "part_test", schema, partitionSpec, fileFormat, expected);
    // Validate both through the Iceberg reader and through a Hive query.
    HiveIcebergTestUtils.validateData(table, expected, 0);
    HiveIcebergTestUtils.validateDataWithSQL(shell, "part_test", expected, "id");
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)

Example 47 with Record

Use of org.apache.iceberg.data.Record in the Apache Hive project.

Class TestHiveIcebergSelects, method testMultiColumnPruning.

/**
 * Column pruning could become problematic when a single Map Task contains multiple TableScan operators where
 * different columns are pruned. This only occurs on MR, as Tez initializes a single Map task for every TableScan
 * operator.
 */
@Test
public void testMultiColumnPruning() throws IOException {
    shell.setHiveSessionValue("hive.cbo.enable", true);
    // First table carries only the join key.
    Schema fkOnlySchema = new Schema(optional(1, "fk", Types.StringType.get()));
    List<Record> fkOnlyRecords = TestHelper.RecordsBuilder.newInstance(fkOnlySchema).add("fk1").build();
    testTables.createTable(shell, "table1", fkOnlySchema, fileFormat, fkOnlyRecords);
    // Second table carries the join key plus a value column, so a different set of columns gets pruned.
    Schema fkValSchema = new Schema(optional(1, "fk", Types.StringType.get()), optional(2, "val", Types.StringType.get()));
    List<Record> fkValRecords = TestHelper.RecordsBuilder.newInstance(fkValSchema).add("fk1", "val").build();
    testTables.createTable(shell, "table2", fkValSchema, fileFormat, fkValRecords);
    // MR is needed for the reproduction
    shell.setHiveSessionValue("hive.execution.engine", "mr");
    String query = "SELECT t2.val FROM table1 t1 JOIN table2 t2 ON t1.fk = t2.fk";
    List<Object[]> result = shell.executeStatement(query);
    Assert.assertEquals(1, result.size());
    Assert.assertArrayEquals(new Object[] { "val" }, result.get(0));
}
Also used : Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 48 with Record

Use of org.apache.iceberg.data.Record in the Apache Hive project.

Class TestHiveIcebergSelects, method testSelectDistinctFromTable.

@Test
public void testSelectDistinctFromTable() throws IOException {
    for (int i = 0; i < SUPPORTED_TYPES.size(); i++) {
        Type type = SUPPORTED_TYPES.get(i);
        if ((type == Types.TimestampType.withZone() || type == Types.TimeType.get()) && isVectorized && fileFormat == FileFormat.ORC) {
            // ORC/TIMESTAMP_INSTANT and time are not supported vectorized types for Hive
            continue;
        }
        // TODO: remove this filter when issue #1881 is resolved
        if (type == Types.UUIDType.get() && fileFormat == FileFormat.PARQUET) {
            continue;
        }
        String typeName = type.typeId().toString().toLowerCase();
        String tableName = typeName + "_table_" + i;
        String columnName = typeName + "_column";
        Schema schema = new Schema(required(1, columnName, type));
        List<Record> records = TestHelper.generateRandomRecords(schema, 4, 0L);
        // Number of unique values among the generated records; the seed makes this deterministic.
        int expectedDistinct = (int) records.stream().map(r -> r.getField(columnName)).distinct().count();
        testTables.createTable(shell, tableName, schema, fileFormat, records);
        List<Object[]> queryResult = shell.executeStatement("select count(distinct(" + columnName + ")) from default." + tableName);
        int actualDistinct = ((Long) queryResult.get(0)[0]).intValue();
        Assert.assertEquals(tableName, expectedDistinct, actualDistinct);
    }
}
Also used : Type(org.apache.iceberg.types.Type) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 49 with Record

Use of org.apache.iceberg.data.Record in the Apache Hive project.

Class TestHiveIcebergSelects, method testJoinTablesSupportedTypes.

@Test
public void testJoinTablesSupportedTypes() throws IOException {
    for (int i = 0; i < SUPPORTED_TYPES.size(); i++) {
        Type type = SUPPORTED_TYPES.get(i);
        if ((type == Types.TimestampType.withZone() || type == Types.TimeType.get()) && isVectorized && fileFormat == FileFormat.ORC) {
            // ORC/TIMESTAMP_INSTANT and time are not supported vectorized types for Hive
            continue;
        }
        // TODO: remove this filter when issue #1881 is resolved
        if (type == Types.UUIDType.get() && fileFormat == FileFormat.PARQUET) {
            continue;
        }
        String typeName = type.typeId().toString().toLowerCase();
        String tableName = typeName + "_table_" + i;
        String columnName = typeName + "_column";
        Schema schema = new Schema(required(1, columnName, type));
        List<Record> records = TestHelper.generateRandomRecords(schema, 1, 0L);
        testTables.createTable(shell, tableName, schema, fileFormat, records);
        // Self-join on the single column: the lone record must match itself exactly once.
        String selfJoinQuery = "select s." + columnName + ", h." + columnName + " from default." + tableName +
            " s join default." + tableName + " h on h." + columnName + "=s." + columnName;
        List<Object[]> queryResult = shell.executeStatement(selfJoinQuery);
        Assert.assertEquals("Non matching record count for table " + tableName + " with type " + type, 1, queryResult.size());
    }
}
Also used : Type(org.apache.iceberg.types.Type) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 50 with Record

Use of org.apache.iceberg.data.Record in the Apache Hive project.

Class HiveIcebergTestUtils, method validateDataWithSQL.

/**
 * Validates whether the table contains the expected records. The records are retrieved by Hive query and compared as
 * strings. The results should be sorted by a unique key so we do not end up with flaky tests.
 * @param shell Shell to execute the query
 * @param tableName The table to query
 * @param expected The expected list of Records
 * @param sortBy The column name by which we will sort
 */
public static void validateDataWithSQL(TestHiveShell shell, String tableName, List<Record> expected, String sortBy) {
    List<Object[]> rows = shell.executeStatement("SELECT * FROM " + tableName + " ORDER BY " + sortBy);
    Assert.assertEquals("Row count mismatch for table " + tableName, expected.size(), rows.size());
    for (int i = 0; i < expected.size(); ++i) {
        Object[] row = rows.get(i);
        Record record = expected.get(i);
        Assert.assertEquals("Column count mismatch in row " + i, record.size(), row.length);
        for (int j = 0; j < record.size(); ++j) {
            Object field = record.get(j);
            if (field == null) {
                // Optional columns may legitimately hold null; previously this fell through to
                // field.toString() and threw an NPE instead of producing a useful assertion.
                Assert.assertNull("Expected null in row " + i + ", column " + j, row[j]);
            } else if (field instanceof LocalDateTime) {
                // Timestamps are compared at epoch-millisecond precision, interpreting the expected value as UTC.
                Assert.assertEquals(((LocalDateTime) field).toInstant(ZoneOffset.UTC).toEpochMilli(), TimestampUtils.stringToTimestamp((String) row[j]).toEpochMilli());
            } else if (field instanceof OffsetDateTime) {
                Assert.assertEquals(((OffsetDateTime) field).toInstant().toEpochMilli(), TimestampTZUtil.parse((String) row[j]).toEpochMilli());
            } else {
                // Guard against an unexpected null actual value before the string comparison (avoids an opaque NPE).
                Assert.assertNotNull("Unexpected null in row " + i + ", column " + j, row[j]);
                // All remaining types are compared via their string representations.
                Assert.assertEquals(field.toString(), row[j].toString());
            }
        }
    }
}
Also used : LocalDateTime(java.time.LocalDateTime) OffsetDateTime(java.time.OffsetDateTime) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record)

Aggregations

Record (org.apache.iceberg.data.Record)114 Test (org.junit.Test)99 Schema (org.apache.iceberg.Schema)68 Table (org.apache.iceberg.Table)51 GenericRecord (org.apache.iceberg.data.GenericRecord)51 PartitionSpec (org.apache.iceberg.PartitionSpec)19 ArrayList (java.util.ArrayList)14 List (java.util.List)13 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)12 HashMap (java.util.HashMap)11 IcebergBaseTest (org.apache.drill.metastore.iceberg.IcebergBaseTest)11 TestHelper (org.apache.iceberg.mr.TestHelper)11 ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList)10 Types (org.apache.iceberg.types.Types)10 Map (java.util.Map)9 IOException (java.io.IOException)8 ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap)8 FileFormat (org.apache.iceberg.FileFormat)7 DeleteFile (org.apache.iceberg.DeleteFile)6 NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional)6