Use of org.apache.iceberg.StructLike in project incubator-gobblin by Apache.
Class IcebergMetadataWriter, method getIcebergDataFilesToBeAdded.
/**
 * Builds the Iceberg {@link DataFile}s, including metrics, for the files that should be added to the table.
 *
 * <p>For each input file the partition value is resolved via {@code getIcebergPartitionVal} and the
 * {@link DataFile} is created by {@code IcebergUtils.getIcebergDataFileWithMetric}.
 *
 * <p>If completeness is enabled ({@code tableMetadata.newPartitionColumnEnabled}) and the event is an
 * {@code add_files} operation, a new field (late) is added to the table schema and partition spec,
 * computed from the date partition and the completion watermark. In that case this method also refreshes
 * {@code tableMetadata.prevCompletenessWatermark} from the table properties and records the file's date
 * partition in {@code tableMetadata.datePartitions}.
 *
 * <p>This is best effort: a file whose processing throws is logged at WARN level and skipped, so the
 * returned set may contain fewer entries than {@code files}.
 *
 * @param table the Iceberg table the files belong to
 * @param tableMetadata per-table writer state; updated as described above
 * @param gmce the change event being processed
 * @param files the files announced by the event
 * @param partitionSpec the partition spec used to resolve each file's partition value
 * @param newSpecsMap Hive specs keyed by the parent directory of the file path
 * @param schemaIdMap passed through to {@code IcebergUtils.getIcebergDataFileWithMetric}
 * @return the data files that could be built successfully
 */
private Set<DataFile> getIcebergDataFilesToBeAdded(Table table, TableMetadata tableMetadata, GobblinMetadataChangeEvent gmce, List<org.apache.gobblin.metadata.DataFile> files, PartitionSpec partitionSpec, Map<String, Collection<HiveSpec>> newSpecsMap, Map<Integer, Integer> schemaIdMap) {
  Set<DataFile> dataFiles = new HashSet<>();
  for (org.apache.gobblin.metadata.DataFile file : files) {
    try {
      Collection<HiveSpec> hiveSpecs = newSpecsMap.get(new Path(file.getFilePath()).getParent().toString());
      StructLike partition = getIcebergPartitionVal(hiveSpecs, file.getFilePath(), partitionSpec);
      if (tableMetadata.newPartitionColumnEnabled && gmce.getOperationType() == OperationType.add_files) {
        tableMetadata.prevCompletenessWatermark = Long.parseLong(table.properties().getOrDefault(COMPLETION_WATERMARK_KEY, String.valueOf(DEFAULT_COMPLETION_WATERMARK)));
        // Assumes first partition value to be partitioned by date
        // TODO Find better way to determine a partition value
        // Pass an explicit class token: StructLike implementations may dereference it, so null risks an NPE.
        String datepartition = partition.get(0, String.class);
        partition = addLatePartitionValueToIcebergTable(table, tableMetadata, hiveSpecs.iterator().next().getPartition().get(), datepartition);
        tableMetadata.datePartitions.add(getDateTimeFromDatepartitionString(datepartition));
      }
      dataFiles.add(IcebergUtils.getIcebergDataFileWithMetric(file, table.spec(), partition, conf, schemaIdMap));
    } catch (Exception e) {
      // The exception is passed as an extra trailing argument so the stack trace is always logged,
      // while its summary still fills the second placeholder.
      log.warn("Cannot get DataFile for {} due to {}", file.getFilePath(), e.toString(), e);
    }
  }
  return dataFiles;
}
Use of org.apache.iceberg.StructLike in project incubator-gobblin by Apache.
Class IcebergMetadataWriter, method getIcebergDataFilesToBeDeleted.
/**
 * Collects the {@link DataFile}s (without metrics information) that should be deleted from Iceberg.
 *
 * <p>If {@code oldFilePrefixes} is specified in the gmce, the table metadata is queried for every file
 * whose path starts with one of those prefixes; both the full prefix and its raw URI path are matched.
 * Otherwise each entry of {@code gmce.getOldFiles()} is converted through
 * {@code IcebergUtils.getIcebergDataFileWithoutMetric}, using the partition value resolved from
 * {@code oldSpecsMap}, falling back to {@code newSpecsMap} when the old map has no entry for the
 * file's parent directory.
 *
 * @param gmce the change event describing the files to remove
 * @param table the Iceberg table to search
 * @param newSpecsMap Hive specs of the new files, keyed by parent directory
 * @param oldSpecsMap Hive specs of the old files, keyed by parent directory
 * @param partitionSpec the partition spec used to build the data files
 * @return the data files to delete
 * @throws IOException if closing the metadata scan fails
 */
private Set<DataFile> getIcebergDataFilesToBeDeleted(GobblinMetadataChangeEvent gmce, Table table, Map<String, Collection<HiveSpec>> newSpecsMap, Map<String, Collection<HiveSpec>> oldSpecsMap, PartitionSpec partitionSpec) throws IOException {
  Set<DataFile> oldDataFiles = new HashSet<>();
  if (gmce.getOldFilePrefixes() != null) {
    Expression exp = Expressions.alwaysFalse();
    for (String prefix : gmce.getOldFilePrefixes()) {
      // Use both full path and raw path to filter old files
      exp = Expressions.or(exp, Expressions.startsWith(ICEBERG_FILE_PATH_COLUMN, prefix));
      String rawPathPrefix = new Path(prefix).toUri().getRawPath();
      exp = Expressions.or(exp, Expressions.startsWith(ICEBERG_FILE_PATH_COLUMN, rawPathPrefix));
    }
    long start = System.currentTimeMillis();
    // The scan result is closeable; drain it under try-with-resources so it is released even on failure.
    collectAndClose(FindFiles.in(table).withMetadataMatching(exp).collect(), oldDataFiles);
    // Use INFO level log here to get better estimate.
    // This shouldn't overwhelm the log since we receive limited number of rewrite_file gmces for one table in a day
    log.info("Spent {}ms to query all old files in iceberg.", System.currentTimeMillis() - start);
  } else {
    for (String file : gmce.getOldFiles()) {
      String specPath = new Path(file).getParent().toString();
      // For the use case of recompaction, the old path may contains /daily path, in this case, we find the spec from newSpecsMap
      StructLike partitionVal = getIcebergPartitionVal(oldSpecsMap.containsKey(specPath) ? oldSpecsMap.get(specPath) : newSpecsMap.get(specPath), file, partitionSpec);
      oldDataFiles.add(IcebergUtils.getIcebergDataFileWithoutMetric(file, partitionSpec, partitionVal));
    }
  }
  return oldDataFiles;
}

/** Copies every element of {@code source} into {@code sink}, then closes {@code source}. */
private static <T, S extends Iterable<T> & java.io.Closeable> void collectAndClose(S source, Collection<? super T> sink) throws IOException {
  try (S closing = source) {
    for (T element : closing) {
      sink.add(element);
    }
  }
}
Use of org.apache.iceberg.StructLike in project hive by Apache.
Class TestHiveIcebergStorageHandlerLocalScan, method testCreateTableWithColumnSpecification.
/**
 * Creates an unpartitioned table from an explicit column list (with column comments) and checks
 * that it can be created and read back.
 */
@Test
public void testCreateTableWithColumnSpecification() throws IOException {
  TableIdentifier customersTable = TableIdentifier.of("default", "customers");

  // Single entry keyed by null; presumably this stands for "no partition" - confirm against runCreateAndReadTest.
  Map<StructLike, List<Record>> expectedByPartition = new HashMap<>(1);
  expectedByPartition.put(null, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);

  String columnList = " (customer_id BIGINT, first_name STRING COMMENT 'This is first name', "
      + "last_name STRING COMMENT 'This is last name')";
  String ddl = "CREATE EXTERNAL TABLE " + customersTable
      + columnList
      + " STORED BY ICEBERG "
      + testTables.locationForCreateTableSQL(customersTable)
      + testTables.propertiesForCreateTableSQL(ImmutableMap.of());

  runCreateAndReadTest(
      customersTable,
      ddl,
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      PartitionSpec.unpartitioned(),
      expectedByPartition);
}
Use of org.apache.iceberg.StructLike in project hive by Apache.
Class TestHiveIcebergStorageHandlerLocalScan, method testCreatePartitionedTableByProperty.
/**
 * Creates a table whose schema and partition spec (identity on {@code last_name}) are supplied
 * through TBLPROPERTIES, then checks that it can be created and read back.
 */
@Test
public void testCreatePartitionedTableByProperty() throws IOException {
  TableIdentifier customersTable = TableIdentifier.of("default", "customers");
  PartitionSpec byLastName = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("last_name")
      .build();

  // Expected data: the i-th customer record is the only row of the i-th last-name partition.
  List<Record> customers = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS;
  String[] lastNames = {"Brown", "Green", "Pink"};
  ImmutableMap.Builder<StructLike, List<Record>> expected = ImmutableMap.builder();
  for (int i = 0; i < lastNames.length; i++) {
    expected.put(Row.of(lastNames[i]), Collections.singletonList(customers.get(i)));
  }
  Map<StructLike, List<Record>> expectedByPartition = expected.build();

  String location = testTables.locationForCreateTableSQL(customersTable);
  String specProperty = "'" + InputFormatConfig.PARTITION_SPEC + "'='" + PartitionSpecParser.toJson(byLastName) + "', ";
  String schemaProperty = "'" + InputFormatConfig.TABLE_SCHEMA + "'='"
      + SchemaParser.toJson(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA) + "', ";
  String catalogProperty = "'" + InputFormatConfig.CATALOG_NAME + "'='" + testTables.catalogName() + "'";
  String ddl = "CREATE EXTERNAL TABLE " + customersTable
      + " STORED BY ICEBERG "
      + location
      + "TBLPROPERTIES ("
      + specProperty
      + schemaProperty
      + catalogProperty
      + ")";

  runCreateAndReadTest(
      customersTable,
      ddl,
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      byLastName,
      expectedByPartition);
}
Use of org.apache.iceberg.StructLike in project hive by Apache.
Class TestHiveIcebergStorageHandlerLocalScan, method testCreateTableWithColumnSpecificationMultilevelPartitioned.
/**
 * Creates a table with an explicit column list that is partitioned by two columns
 * ({@code first_name}, then {@code last_name}) and checks that it can be created and read back.
 */
@Test
public void testCreateTableWithColumnSpecificationMultilevelPartitioned() throws IOException {
  TableIdentifier customersTable = TableIdentifier.of("default", "customers");
  PartitionSpec byFullName = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("first_name")
      .identity("last_name")
      .build();

  // Expected data: the i-th customer record is the only row of the i-th (first_name, last_name) partition.
  List<Record> customers = HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS;
  String[][] fullNames = {{"Alice", "Brown"}, {"Bob", "Green"}, {"Trudy", "Pink"}};
  ImmutableMap.Builder<StructLike, List<Record>> expected = ImmutableMap.builder();
  for (int i = 0; i < fullNames.length; i++) {
    expected.put(Row.of(fullNames[i][0], fullNames[i][1]), Collections.singletonList(customers.get(i)));
  }
  Map<StructLike, List<Record>> expectedByPartition = expected.build();

  String dataColumns = " (customer_id BIGINT) ";
  String partitionColumns = "PARTITIONED BY (first_name STRING COMMENT 'This is first name', "
      + "last_name STRING COMMENT 'This is last name') ";
  String ddl = "CREATE EXTERNAL TABLE " + customersTable
      + dataColumns
      + partitionColumns
      + "STORED BY ICEBERG "
      + testTables.locationForCreateTableSQL(customersTable)
      + testTables.propertiesForCreateTableSQL(ImmutableMap.of());

  runCreateAndReadTest(
      customersTable,
      ddl,
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      byFullName,
      expectedByPartition);
}
Aggregations