Search in sources :

Example 6 with StructLike

use of org.apache.iceberg.StructLike in project incubator-gobblin by apache.

the class IcebergMetadataWriter method getIcebergDataFilesToBeAdded.

/**
 * Builds the Iceberg {@link DataFile}s (with metrics information) that should be added to the table.
 *
 * <p>For every file in {@code files} the partition value is resolved from the {@link HiveSpec}s registered in
 * {@code newSpecsMap} under the file's parent directory, and the DataFile is created through
 * {@code IcebergUtils.getIcebergDataFileWithMetric}.
 *
 * <p>When {@code tableMetadata.newPartitionColumnEnabled} is set and the event is an {@code add_files} operation,
 * the previous completion watermark is refreshed from the table properties, the partition value is replaced by
 * the result of {@code addLatePartitionValueToIcebergTable}, and the file's date partition is recorded in
 * {@code tableMetadata.datePartitions}.
 *
 * <p>This method is best effort: any exception raised while handling a single file is logged and that file is
 * skipped, so the returned set may contain fewer entries than {@code files}.
 *
 * @param table the Iceberg table; its properties and current spec are read
 * @param tableMetadata mutable per-table state updated as described above
 * @param gmce the change event whose operation type is inspected
 * @param files the Gobblin data files to convert
 * @param partitionSpec the partition spec used to compute each file's partition value
 * @param newSpecsMap Hive specs keyed by directory path
 * @param schemaIdMap passed through to {@code IcebergUtils.getIcebergDataFileWithMetric}
 * @return the DataFiles that could be built successfully
 */
private Set<DataFile> getIcebergDataFilesToBeAdded(Table table, TableMetadata tableMetadata, GobblinMetadataChangeEvent gmce, List<org.apache.gobblin.metadata.DataFile> files, PartitionSpec partitionSpec, Map<String, Collection<HiveSpec>> newSpecsMap, Map<Integer, Integer> schemaIdMap) {
    Set<DataFile> dataFiles = new HashSet<>();
    for (org.apache.gobblin.metadata.DataFile file : files) {
        try {
            Collection<HiveSpec> hiveSpecs = newSpecsMap.get(new Path(file.getFilePath()).getParent().toString());
            StructLike partition = getIcebergPartitionVal(hiveSpecs, file.getFilePath(), partitionSpec);
            if (tableMetadata.newPartitionColumnEnabled && gmce.getOperationType() == OperationType.add_files) {
                tableMetadata.prevCompletenessWatermark = Long.parseLong(table.properties().getOrDefault(COMPLETION_WATERMARK_KEY, String.valueOf(DEFAULT_COMPLETION_WATERMARK)));
                // Assumes first partition value to be partitioned by date
                // TODO Find better way to determine a partition value
                String datepartition = partition.get(0, null);
                partition = addLatePartitionValueToIcebergTable(table, tableMetadata, hiveSpecs.iterator().next().getPartition().get(), datepartition);
                tableMetadata.datePartitions.add(getDateTimeFromDatepartitionString(datepartition));
            }
            dataFiles.add(IcebergUtils.getIcebergDataFileWithMetric(file, table.spec(), partition, conf, schemaIdMap));
        } catch (Exception e) {
            // Deliberately best effort: one bad file must not block the others.
            // The exception is passed as the trailing throwable argument (no placeholder for it)
            // so the logger records the full stack trace.
            log.warn("Cannot get DataFile for {} due to exception", file.getFilePath(), e);
        }
    }
    return dataFiles;
}
Also used : DataFile(org.apache.iceberg.DataFile) Path(org.apache.hadoop.fs.Path) StructLike(org.apache.iceberg.StructLike) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec) AlreadyExistsException(org.apache.iceberg.exceptions.AlreadyExistsException) SchemaRegistryException(org.apache.gobblin.metrics.kafka.SchemaRegistryException) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) IOException(java.io.IOException) HashSet(java.util.HashSet)

Example 7 with StructLike

use of org.apache.iceberg.StructLike in project incubator-gobblin by apache.

the class IcebergMetadataWriter method getIcebergDataFilesToBeDeleted.

/**
 * Collects the Iceberg {@link DataFile}s (without metrics information) that should be deleted from the table.
 *
 * <p>If {@code oldFilePrefixes} is specified in the gmce, the table is queried for every file whose path starts
 * with one of those prefixes (both the prefix as given and its raw URI path are tried). Otherwise each path in
 * {@code gmce.getOldFiles()} is converted with {@code IcebergUtils.getIcebergDataFileWithoutMetric}.
 *
 * @param gmce the change event describing the old files or old file prefixes
 * @param table the Iceberg table queried when prefixes are supplied
 * @param newSpecsMap Hive specs keyed by directory path, used when {@code oldSpecsMap} has no entry for a path
 * @param oldSpecsMap Hive specs keyed by directory path for the old files
 * @param partitionSpec the partition spec used to build partition values and DataFiles
 * @return the DataFiles to delete
 * @throws IOException if closing the result of the table query fails
 */
private Set<DataFile> getIcebergDataFilesToBeDeleted(GobblinMetadataChangeEvent gmce, Table table, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, PartitionSpec partitionSpec) throws IOException {
    Set<DataFile> oldDataFiles = new HashSet<>();
    if (gmce.getOldFilePrefixes() != null) {
        Expression exp = Expressions.alwaysFalse();
        for (String prefix : gmce.getOldFilePrefixes()) {
            // Use both full path and raw path to filter old files
            exp = Expressions.or(exp, Expressions.startsWith(ICEBERG_FILE_PATH_COLUMN, prefix));
            String rawPathPrefix = new Path(prefix).toUri().getRawPath();
            exp = Expressions.or(exp, Expressions.startsWith(ICEBERG_FILE_PATH_COLUMN, rawPathPrefix));
        }
        long start = System.currentTimeMillis();
        // collect() hands back a closeable iterable; close it once drained so the resources
        // held by the scan are released, and add results directly instead of via a temporary set.
        try (org.apache.iceberg.io.CloseableIterable<DataFile> matchedFiles = FindFiles.in(table).withMetadataMatching(exp).collect()) {
            for (DataFile matchedFile : matchedFiles) {
                oldDataFiles.add(matchedFile);
            }
        }
        // Use INFO level log here to get better estimate.
        // This shouldn't overwhelm the log since we receive limited number of rewrite_file gmces for one table in a day
        log.info("Spent {}ms to query all old files in iceberg.", System.currentTimeMillis() - start);
    } else {
        for (String file : gmce.getOldFiles()) {
            String specPath = new Path(file).getParent().toString();
            // For the use case of recompaction, the old path may contain a /daily path; in this case, we find the spec from newSpecsMap
            Collection<HiveSpec> specs = oldSpecsMap.containsKey(specPath) ? oldSpecsMap.get(specPath) : newSpecsMap.get(specPath);
            StructLike partitionVal = getIcebergPartitionVal(specs, file, partitionSpec);
            oldDataFiles.add(IcebergUtils.getIcebergDataFileWithoutMetric(file, partitionSpec, partitionVal));
        }
    }
    return oldDataFiles;
}
Also used : DataFile(org.apache.iceberg.DataFile) Path(org.apache.hadoop.fs.Path) Expression(org.apache.iceberg.expressions.Expression) StructLike(org.apache.iceberg.StructLike) HashSet(java.util.HashSet)

Example 8 with StructLike

use of org.apache.iceberg.StructLike in project hive by apache.

the class TestHiveIcebergStorageHandlerLocalScan method testCreateTableWithColumnSpecification.

/**
 * Creates an unpartitioned table whose columns (with comments) are declared in the CREATE statement and
 * hands the statement, expected schema, spec and data to {@code runCreateAndReadTest}.
 */
@Test
public void testCreateTableWithColumnSpecification() throws IOException {
    TableIdentifier tableId = TableIdentifier.of("default", "customers");

    // All customer records are registered under a single null key
    // (presumably meaning "no partition" for the unpartitioned spec - confirm against runCreateAndReadTest).
    Map<StructLike, List<Record>> recordsByPartition = new HashMap<>(1);
    recordsByPartition.put(null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);

    StringBuilder ddl = new StringBuilder("CREATE EXTERNAL TABLE ");
    ddl.append(tableId);
    ddl.append(" (customer_id BIGINT, first_name STRING COMMENT 'This is first name', ");
    ddl.append("last_name STRING COMMENT 'This is last name')");
    ddl.append(" STORED BY ICEBERG ");
    ddl.append(testTables.locationForCreateTableSQL(tableId));
    ddl.append(testTables.propertiesForCreateTableSQL(ImmutableMap.of()));

    runCreateAndReadTest(
        tableId,
        ddl.toString(),
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
        PartitionSpec.unpartitioned(),
        recordsByPartition);
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) List(java.util.List) StructLike(org.apache.iceberg.StructLike) Test(org.junit.Test)

Example 9 with StructLike

use of org.apache.iceberg.StructLike in project hive by apache.

the class TestHiveIcebergStorageHandlerLocalScan method testCreatePartitionedTableByProperty.

/**
 * Creates a table partitioned by identity("last_name"), passing the partition spec, table schema and
 * catalog name through TBLPROPERTIES, and hands everything to {@code runCreateAndReadTest}.
 */
@Test
public void testCreatePartitionedTableByProperty() throws IOException {
    TableIdentifier tableId = TableIdentifier.of("default", "customers");
    PartitionSpec lastNameSpec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("last_name")
        .build();

    // One expected record per last_name partition value.
    List<Record> customers = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS;
    Map<StructLike, List<Record>> recordsByPartition = ImmutableMap.of(
        Row.of("Brown"), Collections.singletonList(customers.get(0)),
        Row.of("Green"), Collections.singletonList(customers.get(1)),
        Row.of("Pink"), Collections.singletonList(customers.get(2)));

    String locationClause = testTables.locationForCreateTableSQL(tableId);

    // Each entry renders as 'key'='value'; entries are separated by ", " inside TBLPROPERTIES (...).
    String specProperty = "'" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(lastNameSpec) + "'";
    String schemaProperty = "'" + InputFormatConfig.TABLE_SCHEMA + "'='" + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "'";
    String catalogProperty = "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "'";
    String tblProperties = "TBLPROPERTIES (" + String.join(", ", specProperty, schemaProperty, catalogProperty) + ")";

    String ddl = "CREATE EXTERNAL TABLE " + tableId + " STORED BY ICEBERG " + locationClause + tblProperties;

    runCreateAndReadTest(
        tableId,
        ddl,
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
        lastNameSpec,
        recordsByPartition);
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) ArrayList(java.util.ArrayList) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) List(java.util.List) StructLike(org.apache.iceberg.StructLike) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)

Example 10 with StructLike

use of org.apache.iceberg.StructLike in project hive by apache.

the class TestHiveIcebergStorageHandlerLocalScan method testCreateTableWithColumnSpecificationMultilevelPartitioned.

/**
 * Creates a table partitioned on two columns (first_name, last_name) declared through PARTITIONED BY in the
 * CREATE statement and hands the statement, schema, spec and expected data to {@code runCreateAndReadTest}.
 */
@Test
public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException {
    TableIdentifier tableId = TableIdentifier.of("default", "customers");
    PartitionSpec nameSpec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .identity("first_name")
        .identity("last_name")
        .build();

    // One expected record per (first_name, last_name) partition value.
    List<Record> customers = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS;
    Map<StructLike, List<Record>> recordsByPartition = ImmutableMap.of(
        Row.of("Alice", "Brown"), Collections.singletonList(customers.get(0)),
        Row.of("Bob", "Green"), Collections.singletonList(customers.get(1)),
        Row.of("Trudy", "Pink"), Collections.singletonList(customers.get(2)));

    StringBuilder ddl = new StringBuilder("CREATE EXTERNAL TABLE ");
    ddl.append(tableId);
    ddl.append(" (customer_id BIGINT) ");
    ddl.append("PARTITIONED BY (first_name STRING COMMENT 'This is first name', ");
    ddl.append("last_name STRING COMMENT 'This is last name') ");
    ddl.append("STORED BY ICEBERG ");
    ddl.append(testTables.locationForCreateTableSQL(tableId));
    ddl.append(testTables.propertiesForCreateTableSQL(ImmutableMap.of()));

    runCreateAndReadTest(
        tableId,
        ddl.toString(),
        HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
        nameSpec,
        recordsByPartition);
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) ArrayList(java.util.ArrayList) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) List(java.util.List) StructLike(org.apache.iceberg.StructLike) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)

Aggregations

StructLike (org.apache.iceberg.StructLike)11 List (java.util.List)5 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 PartitionSpec (org.apache.iceberg.PartitionSpec)4 TableIdentifier (org.apache.iceberg.catalog.TableIdentifier)4 ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList)4 Test (org.junit.Test)4 IOException (java.io.IOException)3 DataFile (org.apache.iceberg.DataFile)3 HashSet (java.util.HashSet)2 Path (org.apache.hadoop.fs.Path)2 Record (org.apache.iceberg.data.Record)2 Type (org.apache.iceberg.types.Type)2 Types (org.apache.iceberg.types.Types)2 UncheckedIOException (java.io.UncheckedIOException)1 Array (java.lang.reflect.Array)1 ByteBuffer (java.nio.ByteBuffer)1 Arrays (java.util.Arrays)1 Map (java.util.Map)1