Example 11 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method testDryRun.

@Test
public void testDryRun() throws IOException, InterruptedException {
    Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), Maps.newHashMap(), tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList(new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"));
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    List<String> validFiles = spark.read().format("iceberg").load(tableLocation + "#files").select("file_path").as(Encoders.STRING()).collectAsList();
    Assert.assertEquals("Should be 2 valid files", 2, validFiles.size());
    df.write().mode("append").parquet(tableLocation + "/data");
    Path dataPath = new Path(tableLocation + "/data");
    FileSystem fs = dataPath.getFileSystem(spark.sessionState().newHadoopConf());
    List<String> allFiles = Arrays.stream(fs.listStatus(dataPath, HiddenPathFilter.get())).filter(FileStatus::isFile).map(file -> file.getPath().toString()).collect(Collectors.toList());
    Assert.assertEquals("Should be 3 files", 3, allFiles.size());
    List<String> invalidFiles = Lists.newArrayList(allFiles);
    invalidFiles.removeAll(validFiles);
    Assert.assertEquals("Should be 1 invalid file", 1, invalidFiles.size());
    // sleep for 1 second to ensure files will be old enough
    Thread.sleep(1000);
    SparkActions actions = SparkActions.get();
    DeleteOrphanFiles.Result result1 = actions.deleteOrphanFiles(table).deleteWith(s -> {
    }).execute();
    Assert.assertTrue("Default olderThan interval should be safe", Iterables.isEmpty(result1.orphanFileLocations()));
    DeleteOrphanFiles.Result result2 = actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).deleteWith(s -> {
    }).execute();
    Assert.assertEquals("Action should find 1 file", invalidFiles, result2.orphanFileLocations());
    Assert.assertTrue("Invalid file should be present", fs.exists(new Path(invalidFiles.get(0))));
    DeleteOrphanFiles.Result result3 = actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
    Assert.assertEquals("Action should delete 1 file", invalidFiles, result3.orphanFileLocations());
    Assert.assertFalse("Invalid file should not be present", fs.exists(new Path(invalidFiles.get(0))));
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records);
    expectedRecords.addAll(records);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) FileSystem(org.apache.hadoop.fs.FileSystem) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) FileStatus(org.apache.hadoop.fs.FileStatus) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) StreamSupport(java.util.stream.StreamSupport) Namespace(org.apache.iceberg.catalog.Namespace) ExecutorService(java.util.concurrent.ExecutorService) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) AssertHelpers(org.apache.iceberg.AssertHelpers) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) HadoopCatalog(org.apache.iceberg.hadoop.HadoopCatalog) Table(org.apache.iceberg.Table) HiddenPathFilter(org.apache.iceberg.hadoop.HiddenPathFilter) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) Set(java.util.Set) IOException(java.io.IOException) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) File(java.io.File) Executors(java.util.concurrent.Executors) Encoders(org.apache.spark.sql.Encoders) ValidationException(org.apache.iceberg.exceptions.ValidationException) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) Snapshot(org.apache.iceberg.Snapshot) Table(org.apache.iceberg.Table) FileStatus(org.apache.hadoop.fs.FileStatus) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
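
Stripped of the assertions, the pattern this test exercises is: pass a no-op deleteWith consumer to get a dry run that only reports orphan locations, then run the same action without it to actually delete. A minimal sketch, assuming an existing Iceberg Table named table as created above:

// Dry run: report orphan file locations without removing anything (no-op delete consumer).
DeleteOrphanFiles.Result dryRun = SparkActions.get()
    .deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis())
    .deleteWith(location -> { /* intentionally do nothing */ })
    .execute();
dryRun.orphanFileLocations().forEach(System.out::println);

// Real cleanup: the same call without deleteWith, so the reported orphans are deleted.
DeleteOrphanFiles.Result cleaned = SparkActions.get()
    .deleteOrphanFiles(table)
    .olderThan(System.currentTimeMillis())
    .execute();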

Example 12 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRemoveOrphanFilesAction, method testManyTopLevelPartitions.

@Test
public void testManyTopLevelPartitions() throws InterruptedException {
    Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);
    List<ThreeColumnRecord> records = Lists.newArrayList();
    for (int i = 0; i < 100; i++) {
        records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i)));
    }
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    df.select("c1", "c2", "c3").write().format("iceberg").mode("append").save(tableLocation);
    // sleep for 1 second to ensure files will be old enough
    Thread.sleep(1000);
    SparkActions actions = SparkActions.get();
    DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(System.currentTimeMillis()).execute();
    Assert.assertTrue("Should not delete any files", Iterables.isEmpty(result.orphanFileLocations()));
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", records, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) DeleteOrphanFiles(org.apache.iceberg.actions.DeleteOrphanFiles) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Test(org.junit.Test)
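
The same record-building boilerplate recurs across these tests. A hedged sketch of a hypothetical helper (not part of the test class) that condenses it, using only calls that appear in the examples above:

// Hypothetical helper: build N ThreeColumnRecord rows and append them to an Iceberg table location.
private static void appendRecords(SparkSession spark, String tableLocation, int count) {
    List<ThreeColumnRecord> records = Lists.newArrayList();
    for (int i = 0; i < count; i++) {
        records.add(new ThreeColumnRecord(i, String.valueOf(i), String.valueOf(i)));
    }
    spark.createDataFrame(records, ThreeColumnRecord.class)
        .select("c1", "c2", "c3")
        .write().format("iceberg").mode("append")
        .save(tableLocation);
}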

Example 13 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteDataFilesAction, method writeRecords.

private void writeRecords(int files, int numRecords, int partitions) {
    List<ThreeColumnRecord> records = Lists.newArrayList();
    int rowDimension = (int) Math.ceil(Math.sqrt(numRecords));
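    // Note: rowDimension * rowDimension coordinates are generated below, so the actual record count can slightly exceed numRecords.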
    List<Pair<Integer, Integer>> data = IntStream.range(0, rowDimension).boxed().flatMap(x -> IntStream.range(0, rowDimension).boxed().map(y -> Pair.of(x, y))).collect(Collectors.toList());
    Collections.shuffle(data, new Random(42));
    if (partitions > 0) {
        data.forEach(i -> records.add(new ThreeColumnRecord(i.first() % partitions, "foo" + i.first(), "bar" + i.second())));
    } else {
        data.forEach(i -> records.add(new ThreeColumnRecord(i.first(), "foo" + i.first(), "bar" + i.second())));
    }
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).repartition(files);
    writeDF(df);
}
Also used : Types(org.apache.iceberg.types.Types) ArgumentMatchers.argThat(org.mockito.ArgumentMatchers.argThat) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) Random(java.util.Random) StructLike(org.apache.iceberg.StructLike) ArgumentMatcher(org.mockito.ArgumentMatcher) Mockito.doThrow(org.mockito.Mockito.doThrow) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) RewriteDataFiles(org.apache.iceberg.actions.RewriteDataFiles) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) PositionDeleteWriter(org.apache.iceberg.deletes.PositionDeleteWriter) Mockito.doAnswer(org.mockito.Mockito.doAnswer) FileScanTask(org.apache.iceberg.FileScanTask) ContentFile(org.apache.iceberg.ContentFile) DataFile(org.apache.iceberg.DataFile) Mockito.doReturn(org.mockito.Mockito.doReturn) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Comparators(org.apache.iceberg.types.Comparators) AssertHelpers(org.apache.iceberg.AssertHelpers) CloseableIterable(org.apache.iceberg.io.CloseableIterable) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) NestedField(org.apache.iceberg.types.Types.NestedField) RowDelta(org.apache.iceberg.RowDelta) Set(java.util.Set) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) UUID(java.util.UUID) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) SparkTableUtil(org.apache.iceberg.spark.SparkTableUtil) Schema(org.apache.iceberg.Schema) FileRewriteCoordinator(org.apache.iceberg.spark.FileRewriteCoordinator) Collectors(java.util.stream.Collectors) ActionsProvider(org.apache.iceberg.actions.ActionsProvider) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) EncryptionKeyMetadata(org.apache.iceberg.encryption.EncryptionKeyMetadata) Stream(java.util.stream.Stream) EncryptedFiles(org.apache.iceberg.encryption.EncryptedFiles) EncryptedOutputFile(org.apache.iceberg.encryption.EncryptedOutputFile) PartitionSpec(org.apache.iceberg.PartitionSpec) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) TableProperties(org.apache.iceberg.TableProperties) Expressions(org.apache.iceberg.expressions.Expressions) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory) IntStream(java.util.stream.IntStream) ArgumentMatchers.any(org.mockito.ArgumentMatchers.any) Mockito.doCallRealMethod(org.mockito.Mockito.doCallRealMethod) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Dataset(org.apache.spark.sql.Dataset) CommitStateUnknownException(org.apache.iceberg.exceptions.CommitStateUnknownException) Pair(org.apache.iceberg.util.Pair) OutputFile(org.apache.iceberg.io.OutputFile) Streams(org.apache.iceberg.relocated.com.google.common.collect.Streams) Mockito.spy(org.mockito.Mockito.spy) RewriteFileGroup(org.apache.iceberg.actions.RewriteFileGroup) BinPackStrategy(org.apache.iceberg.actions.BinPackStrategy) Before(org.junit.Before) SortOrder(org.apache.iceberg.SortOrder) Table(org.apache.iceberg.Table) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) IOException(java.io.IOException) Test(org.junit.Test) MetadataTableType(org.apache.iceberg.MetadataTableType) Row(org.apache.spark.sql.Row) FileFormat(org.apache.iceberg.FileFormat) File(java.io.File) SortStrategy(org.apache.iceberg.actions.SortStrategy) Mockito(org.mockito.Mockito) Record(org.apache.iceberg.data.Record) Conversions(org.apache.iceberg.types.Conversions) 
RewriteDataFilesCommitManager(org.apache.iceberg.actions.RewriteDataFilesCommitManager) Rule(org.junit.Rule) DeleteFile(org.apache.iceberg.DeleteFile) Comparator(java.util.Comparator) Assert(org.junit.Assert) Collections(java.util.Collections) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) FileScanTaskSetManager(org.apache.iceberg.spark.FileScanTaskSetManager) TemporaryFolder(org.junit.rules.TemporaryFolder) Random(java.util.Random) Row(org.apache.spark.sql.Row) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Pair(org.apache.iceberg.util.Pair)
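
A usage sketch for this helper; the argument values below are illustrative, not taken from a specific test. Passing a non-positive partition count skips the modulo bucketing, so c1 carries the raw grid coordinate instead of a partition bucket.

// Illustrative invocation: 4 output files, roughly 1000 records, spread over 10 partitions.
writeRecords(4, 1000, 10);

// Unpartitioned variant: partitions <= 0 leaves c1 as the raw coordinate.
writeRecords(4, 1000, 0);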

Example 14 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteManifestsAction, method testRewriteSmallManifestsPartitionedTable.

@Test
public void testRewriteSmallManifestsPartitionedTable() {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c1").truncate("c2", 2).build();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    List<ThreeColumnRecord> records1 = Lists.newArrayList(new ThreeColumnRecord(1, null, "AAAA"), new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"));
    writeRecords(records1);
    List<ThreeColumnRecord> records2 = Lists.newArrayList(new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"), new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD"));
    writeRecords(records2);
    List<ThreeColumnRecord> records3 = Lists.newArrayList(new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"), new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF"));
    writeRecords(records3);
    List<ThreeColumnRecord> records4 = Lists.newArrayList(new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"), new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH"));
    writeRecords(records4);
    table.refresh();
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 4 manifests before rewrite", 4, manifests.size());
    SparkActions actions = SparkActions.get();
    // we will expect to have 2 manifests with 4 entries in each after rewrite
    long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests);
    long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes);
    table.updateProperties().set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes)).commit();
    RewriteManifests.Result result = actions.rewriteManifests(table).rewriteIf(manifest -> true).execute();
    Assert.assertEquals("Action should rewrite 4 manifests", 4, Iterables.size(result.rewrittenManifests()));
    Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests()));
    table.refresh();
    List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
    Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
    Assert.assertFalse(newManifests.get(0).hasAddedFiles());
    Assert.assertFalse(newManifests.get(0).hasDeletedFiles());
    Assert.assertEquals(4, (long) newManifests.get(1).existingFilesCount());
    Assert.assertFalse(newManifests.get(1).hasAddedFiles());
    Assert.assertFalse(newManifests.get(1).hasDeletedFiles());
    List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
    expectedRecords.addAll(records1);
    expectedRecords.addAll(records2);
    expectedRecords.addAll(records3);
    expectedRecords.addAll(records4);
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Also used : Types(org.apache.iceberg.types.Types) Dataset(org.apache.spark.sql.Dataset) RunWith(org.junit.runner.RunWith) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) ManifestFile(org.apache.iceberg.ManifestFile) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Parameterized(org.junit.runners.Parameterized) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) Before(org.junit.Before) RewriteManifests(org.apache.iceberg.actions.RewriteManifests) Table(org.apache.iceberg.Table) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) IOException(java.io.IOException) Iterables(org.apache.iceberg.relocated.com.google.common.collect.Iterables) Test(org.junit.Test) SparkTableUtil(org.apache.iceberg.spark.SparkTableUtil) Row(org.apache.spark.sql.Row) Schema(org.apache.iceberg.Schema) TableIdentifier(org.apache.spark.sql.catalyst.TableIdentifier) File(java.io.File) Encoders(org.apache.spark.sql.Encoders) List(java.util.List) Rule(org.junit.Rule) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) Assert(org.junit.Assert) SparkTestBase(org.apache.iceberg.spark.SparkTestBase) TemporaryFolder(org.junit.rules.TemporaryFolder) Snapshot(org.apache.iceberg.Snapshot) Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) ManifestFile(org.apache.iceberg.ManifestFile) RewriteManifests(org.apache.iceberg.actions.RewriteManifests) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
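
Stripped of the assertions, the rewrite itself is a two-step recipe: commit a manifest target size on the table, then run the action over the manifests you want compacted. A minimal sketch assuming an existing table; the 8 MB target is an illustrative value, whereas the test derives one from observed per-entry sizes:

// Step 1: set the target manifest size (illustrative 8 MB value).
table.updateProperties()
    .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(8L * 1024 * 1024))
    .commit();

// Step 2: rewrite every manifest that matches the predicate (here, all of them).
RewriteManifests.Result result = SparkActions.get()
    .rewriteManifests(table)
    .rewriteIf(manifest -> true)
    .execute();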

Example 15 with ThreeColumnRecord

Use of org.apache.iceberg.spark.source.ThreeColumnRecord in project iceberg by apache.

From the class TestRewriteManifestsAction, method testRewriteLargeManifestsPartitionedTable.

@Test
public void testRewriteLargeManifestsPartitionedTable() throws IOException {
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("c3").build();
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
    Table table = TABLES.create(SCHEMA, spec, options, tableLocation);
    // all records belong to the same partition
    List<ThreeColumnRecord> records = Lists.newArrayList();
    for (int i = 0; i < 50; i++) {
        records.add(new ThreeColumnRecord(i, String.valueOf(i), "0"));
    }
    Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class);
    // repartition to create separate files
    writeDF(df.repartition(50, df.col("c1")));
    table.refresh();
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 1 manifests before rewrite", 1, manifests.size());
    // set the target manifest size to a small value to force splitting records into multiple files
    table.updateProperties().set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(manifests.get(0).length() / 2)).commit();
    SparkActions actions = SparkActions.get();
    RewriteManifests.Result result = actions.rewriteManifests(table).rewriteIf(manifest -> true).stagingLocation(temp.newFolder().toString()).execute();
    Assert.assertEquals("Action should rewrite 1 manifest", 1, Iterables.size(result.rewrittenManifests()));
    Assert.assertEquals("Action should add 2 manifests", 2, Iterables.size(result.addedManifests()));
    table.refresh();
    List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
    List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", records, actualRecords);
}
Also used : Table(org.apache.iceberg.Table) ThreeColumnRecord(org.apache.iceberg.spark.source.ThreeColumnRecord) PartitionSpec(org.apache.iceberg.PartitionSpec) ManifestFile(org.apache.iceberg.ManifestFile) RewriteManifests(org.apache.iceberg.actions.RewriteManifests) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
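
The only new knob compared to the previous example is stagingLocation, which redirects the temporary manifests written during the rewrite (the test points it at a JUnit temporary folder). A minimal sketch with a placeholder staging path:

RewriteManifests.Result result = SparkActions.get()
    .rewriteManifests(table)
    .rewriteIf(manifest -> true)
    .stagingLocation("/tmp/iceberg-manifest-staging")  // placeholder path, not from the test
    .execute();
int rewritten = Iterables.size(result.rewrittenManifests());
int added = Iterables.size(result.addedManifests());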

Aggregations

ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord): 34 usages
Row (org.apache.spark.sql.Row): 33 usages
Test (org.junit.Test): 32 usages
Table (org.apache.iceberg.Table): 26 usages
PartitionSpec (org.apache.iceberg.PartitionSpec): 18 usages
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles): 12 usages
File (java.io.File): 11 usages
Configuration (org.apache.hadoop.conf.Configuration): 11 usages
List (java.util.List): 10 usages
Map (java.util.Map): 10 usages
Schema (org.apache.iceberg.Schema): 10 usages
Snapshot (org.apache.iceberg.Snapshot): 10 usages
TableProperties (org.apache.iceberg.TableProperties): 10 usages
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 10 usages
Lists (org.apache.iceberg.relocated.com.google.common.collect.Lists): 10 usages
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps): 10 usages
SparkTestBase (org.apache.iceberg.spark.SparkTestBase): 10 usages
Types (org.apache.iceberg.types.Types): 10 usages
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 10 usages
Dataset (org.apache.spark.sql.Dataset): 10 usages