Example 1 with ContentFile

use of org.apache.iceberg.ContentFile in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testBinPackWithDeletes.

@Test
public void testBinPackWithDeletes() throws Exception {
    Table table = createTablePartitioned(4, 2);
    table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit();
    shouldHaveFiles(table, 8);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();
    RowDelta rowDelta = table.newRowDelta();
    // add 1 delete file for data files 0, 1, 2
    for (int i = 0; i < 3; i++) {
        writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes);
    }
    // add 2 delete files for data files 3, 4
    for (int i = 3; i < 5; i++) {
        writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes);
    }
    rowDelta.commit();
    table.refresh();
    List<Object[]> expectedRecords = currentData();
    Result result = actions().rewriteDataFiles(table)
        .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0")
        .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1))
        .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE))
        .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
        .execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount());
    List<Object[]> actualRecords = currentData();
    assertEquals("Rows must match", expectedRecords, actualRecords);
    Assert.assertEquals("7 rows are removed", total - 7, actualRecords.size());
}
Also used : DataFile(org.apache.iceberg.DataFile) Table(org.apache.iceberg.Table) ContentFile(org.apache.iceberg.ContentFile) FileScanTask(org.apache.iceberg.FileScanTask) RowDelta(org.apache.iceberg.RowDelta) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) Test(org.junit.Test)
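
A note on how this test forces selection by deletes rather than by size: MIN_FILE_SIZE_BYTES is set to 0 and the target/max file sizes sit near Long.MAX_VALUE, so the size-based bin-pack criteria match nothing and only data files referenced by at least DELETE_FILE_THRESHOLD = 2 delete files (files 3 and 4 above) are rewritten. The same option can be passed to the rewrite action outside the test harness; a minimal sketch follows, where the SparkActions entry point is an assumption about the engine in use (the test's actions() helper hides the concrete implementation).

import org.apache.iceberg.Table;
import org.apache.iceberg.actions.BinPackStrategy;
import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

public class RewriteByDeleteThresholdSketch {

    // Rewrites any data file referenced by at least two delete files, regardless of its
    // size. SparkActions.get() is assumed to be available in the deployed Iceberg Spark
    // runtime; substitute whatever RewriteDataFiles provider your engine exposes.
    public static RewriteDataFiles.Result rewrite(Table table) {
        return SparkActions.get()
            .rewriteDataFiles(table)
            .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
            .execute();
    }
}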

Example 2 with ContentFile

use of org.apache.iceberg.ContentFile in project hive by apache.

From the class HiveIcebergOutputCommitter, method abortJob.

/**
 * Removes the generated data files if there is a commit file already generated for them.
 * The cleanup at the end removes the temporary directories as well.
 * @param originalContext The job context
 * @param status The status of the job
 * @throws IOException if there is a failure deleting the files
 */
@Override
public void abortJob(JobContext originalContext, int status) throws IOException {
    JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
    JobConf jobConf = jobContext.getJobConf();
    LOG.info("Job {} is aborted. Data file cleaning started", jobContext.getJobID());
    Collection<String> outputs = HiveIcebergStorageHandler.outputTables(jobContext.getJobConf());
    Collection<String> jobLocations = new ConcurrentLinkedQueue<>();
    ExecutorService fileExecutor = fileExecutor(jobConf);
    ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
    try {
        // Cleans up the changes for the output tables in parallel
        Tasks.foreach(outputs)
            .suppressFailureWhenFinished()
            .executeWith(tableExecutor)
            .onFailure((output, exc) -> LOG.warn("Failed cleanup table {} on abort job", output, exc))
            .run(output -> {
            LOG.info("Cleaning job for jobID: {}, table: {}", jobContext.getJobID(), output);
            Table table = HiveIcebergStorageHandler.table(jobConf, output);
            String jobLocation = generateJobLocation(table.location(), jobConf, jobContext.getJobID());
            jobLocations.add(jobLocation);
            // list jobLocation to get number of forCommit files
            // we do this because map/reduce num in jobConf is unreliable and we have no access to vertex status info
            int numTasks = listForCommits(jobConf, jobLocation).size();
            FilesForCommit results = collectResults(numTasks, fileExecutor, table.location(), jobContext, table.io(), false);
            // Check if we have files already written and remove data and delta files if there are any
            Collection<ContentFile> files = Stream.concat(results.dataFiles().stream(), results.deleteFiles().stream()).collect(Collectors.toList());
            if (files.size() > 0) {
                Tasks.foreach(files)
                    .retry(3)
                    .suppressFailureWhenFinished()
                    .executeWith(fileExecutor)
                    .onFailure((file, exc) -> LOG.warn("Failed to remove data file {} on abort job", file.path(), exc))
                    .run(file -> table.io().deleteFile(file.path().toString()));
            }
        }, IOException.class);
    } finally {
        fileExecutor.shutdown();
        if (tableExecutor != null) {
            tableExecutor.shutdown();
        }
    }
    LOG.info("Job {} is aborted. Data file cleaning finished", jobContext.getJobID());
    cleanup(jobContext, jobLocations);
}
Also used : Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Catalogs(org.apache.iceberg.mr.Catalogs) ObjectInputStream(java.io.ObjectInputStream) LoggerFactory(org.slf4j.LoggerFactory) AppendFiles(org.apache.iceberg.AppendFiles) FileStatus(org.apache.hadoop.fs.FileStatus) DeleteFiles(org.apache.iceberg.DeleteFiles) TaskType(org.apache.hadoop.mapreduce.TaskType) OutputCommitter(org.apache.hadoop.mapred.OutputCommitter) WriterRegistry(org.apache.iceberg.mr.hive.writer.WriterRegistry) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) ContentFile(org.apache.iceberg.ContentFile) DataFile(org.apache.iceberg.DataFile) Collection(java.util.Collection) RowDelta(org.apache.iceberg.RowDelta) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) Set(java.util.Set) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) Util(org.apache.iceberg.hadoop.Util) ReplacePartitions(org.apache.iceberg.ReplacePartitions) Stream(java.util.stream.Stream) HiveIcebergWriter(org.apache.iceberg.mr.hive.writer.HiveIcebergWriter) Optional(java.util.Optional) SessionStateUtil(org.apache.hadoop.hive.ql.session.SessionStateUtil) Expressions(org.apache.iceberg.expressions.Expressions) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) NotFoundException(org.apache.iceberg.exceptions.NotFoundException) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) OutputFile(org.apache.iceberg.io.OutputFile) TaskAttemptContext(org.apache.hadoop.mapred.TaskAttemptContext) ObjectOutputStream(java.io.ObjectOutputStream) JobID(org.apache.hadoop.mapreduce.JobID) ExecutorService(java.util.concurrent.ExecutorService) Properties(java.util.Properties) Logger(org.slf4j.Logger) Table(org.apache.iceberg.Table) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder) IOException(java.io.IOException) JobConf(org.apache.hadoop.mapred.JobConf) JobContext(org.apache.hadoop.mapred.JobContext) Tasks(org.apache.iceberg.util.Tasks) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) DeleteFile(org.apache.iceberg.DeleteFile) FileIO(org.apache.iceberg.io.FileIO) VisibleForTesting(org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting) Table(org.apache.iceberg.Table) ContentFile(org.apache.iceberg.ContentFile) ExecutorService(java.util.concurrent.ExecutorService) JobContext(org.apache.hadoop.mapred.JobContext) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) JobConf(org.apache.hadoop.mapred.JobConf)
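
The retrying, parallel file removal above is a reusable pattern from org.apache.iceberg.util.Tasks, which this example already imports. A minimal standalone sketch of the same pattern, assuming only the collected ContentFile handles and the table's FileIO are at hand (the class and method names here are illustrative, not part of the committer):

import java.util.Collection;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.util.Tasks;

public class AbortCleanupSketch {

    // Deletes every written data/delete file through the table's FileIO, retrying each
    // delete up to 3 times and logging failures instead of failing the whole abort.
    static void deleteFiles(Collection<ContentFile> files, FileIO io) {
        Tasks.foreach(files)
            .retry(3)
            .suppressFailureWhenFinished()
            .onFailure((file, exc) ->
                System.err.println("Failed to remove " + file.path() + ": " + exc))
            .run(file -> io.deleteFile(file.path().toString()));
    }
}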

Example 3 with ContentFile

use of org.apache.iceberg.ContentFile in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testRewriteAvoidRepeateCompress.

/**
 * A test case verifying that files are not compressed repeatedly.
 * <p>
 * If a data file cannot be combined with other data files into a CombinedScanTask, the resulting CombinedScanTask
 * list has size 1, so we remove such CombinedScanTasks to avoid compressing the same file again.
 * <p>
 * In this test case, we generate 3 data files and set targetSizeInBytes greater than the largest file size so that it
 * cannot be combined into a CombinedScanTask with the other data files. The data file with the largest file size will
 * not be compressed.
 *
 * @throws IOException IOException
 */
@Test
public void testRewriteAvoidRepeateCompress() throws IOException {
    Assume.assumeFalse("ORC does not support getting length when file is opening", format.equals(FileFormat.ORC));
    List<Record> expected = Lists.newArrayList();
    Schema schema = icebergTableUnPartitioned.schema();
    GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema);
    File file = temp.newFile();
    int count = 0;
    try (FileAppender<Record> fileAppender = genericAppenderFactory.newAppender(Files.localOutput(file), format)) {
        long filesize = 20000;
        for (; fileAppender.length() < filesize; count++) {
            Record record = SimpleDataUtil.createRecord(count, "iceberg");
            fileAppender.add(record);
            expected.add(record);
        }
    }
    DataFile dataFile = DataFiles.builder(icebergTableUnPartitioned.spec())
        .withPath(file.getAbsolutePath())
        .withFileSizeInBytes(file.length())
        .withFormat(format)
        .withRecordCount(count)
        .build();
    icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit();
    sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED);
    sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED);
    icebergTableUnPartitioned.refresh();
    CloseableIterable<FileScanTask> tasks = icebergTableUnPartitioned.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size());
    Actions actions = Actions.forTable(icebergTableUnPartitioned);
    long targetSizeInBytes = file.length() + 10;
    RewriteDataFilesActionResult result = actions.rewriteDataFiles()
        .targetSizeInBytes(targetSizeInBytes)
        .splitOpenFileCost(1)
        .execute();
    Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
    Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());
    icebergTableUnPartitioned.refresh();
    CloseableIterable<FileScanTask> tasks1 = icebergTableUnPartitioned.newScan().planFiles();
    List<DataFile> dataFilesRewrote = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
    Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size());
    // the biggest file should not be rewritten
    List<CharSequence> rewroteDataFileNames = dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList());
    Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath()));
    // Assert the table records as expected.
    expected.add(SimpleDataUtil.createRecord(1, "a"));
    expected.add(SimpleDataUtil.createRecord(2, "b"));
    SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected);
}
Also used : Schema(org.apache.iceberg.Schema) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory) DataFile(org.apache.iceberg.DataFile) RewriteDataFilesActionResult(org.apache.iceberg.actions.RewriteDataFilesActionResult) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) List(java.util.List) FileScanTask(org.apache.iceberg.FileScanTask) ContentFile(org.apache.iceberg.ContentFile) DataFile(org.apache.iceberg.DataFile) File(java.io.File) Test(org.junit.Test)
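
The Javadoc above states the rule this test exercises: a CombinedScanTask that groups only one data file is dropped from the rewrite, so that file is not re-compressed on its own. A minimal sketch of that filtering step written against planned CombinedScanTasks; it illustrates the rule and is not the action's actual implementation:

import java.util.List;
import java.util.stream.Collectors;
import org.apache.iceberg.CombinedScanTask;

public class SkipSingleFileTasksSketch {

    // Keep only combined tasks that bundle at least two files; a single-file task would
    // just rewrite that one file by itself, which the rewrite action avoids.
    static List<CombinedScanTask> worthRewriting(List<CombinedScanTask> planned) {
        return planned.stream()
            .filter(task -> task.files().size() > 1)
            .collect(Collectors.toList());
    }
}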

Example 4 with ContentFile

use of org.apache.iceberg.ContentFile in project iceberg by apache.

From the class TestRewriteDataFilesAction, method testBinPackWithDeleteAllData.

@Test
public void testBinPackWithDeleteAllData() {
    Map<String, String> options = Maps.newHashMap();
    options.put(TableProperties.FORMAT_VERSION, "2");
    Table table = createTablePartitioned(1, 1, 1, options);
    shouldHaveFiles(table, 1);
    table.refresh();
    CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
    List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
    int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();
    RowDelta rowDelta = table.newRowDelta();
    // remove all data
    writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes);
    rowDelta.commit();
    table.refresh();
    List<Object[]> expectedRecords = currentData();
    Result result = actions().rewriteDataFiles(table)
        .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1")
        .execute();
    Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount());
    List<Object[]> actualRecords = currentData();
    assertEquals("Rows must match", expectedRecords, actualRecords);
    Assert.assertEquals("Data manifest should not have existing data file", 0, (long) table.currentSnapshot().dataManifests().get(0).existingFilesCount());
    Assert.assertEquals("Data manifest should have 1 delete data file", 1L, (long) table.currentSnapshot().dataManifests().get(0).deletedFilesCount());
    Assert.assertEquals("Delete manifest added row count should equal total count", total, (long) table.currentSnapshot().deleteManifests().get(0).addedRowsCount());
}
Also used : Table(org.apache.iceberg.Table) ContentFile(org.apache.iceberg.ContentFile) RowDelta(org.apache.iceberg.RowDelta) Result(org.apache.iceberg.actions.RewriteDataFiles.Result) DataFile(org.apache.iceberg.DataFile) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
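
All four examples reach ContentFile the same way: plan a table scan, map each FileScanTask to its file, then read ContentFile fields such as recordCount() or path(). A minimal sketch of just that pattern, assuming only an open Table (the class and method names are illustrative):

import java.io.IOException;
import java.util.List;
import org.apache.iceberg.ContentFile;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;

public class ScanFileStatsSketch {

    // Sums the record counts of all data files visible to a fresh table scan,
    // mirroring the 'total' computation in the tests above.
    static long totalRecords(Table table) throws IOException {
        try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
            List<DataFile> dataFiles =
                Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
            return dataFiles.stream().mapToLong(ContentFile::recordCount).sum();
        }
    }
}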

Aggregations

ContentFile (org.apache.iceberg.ContentFile) 4
DataFile (org.apache.iceberg.DataFile) 4
FileScanTask (org.apache.iceberg.FileScanTask) 3
RowDelta (org.apache.iceberg.RowDelta) 3
Table (org.apache.iceberg.Table) 3
Test (org.junit.Test) 3
File (java.io.File) 1
IOException (java.io.IOException) 1
ObjectInputStream (java.io.ObjectInputStream) 1
ObjectOutputStream (java.io.ObjectOutputStream) 1
Arrays (java.util.Arrays) 1
Collection (java.util.Collection) 1
List (java.util.List) 1
Map (java.util.Map) 1
Optional (java.util.Optional) 1
Properties (java.util.Properties) 1
Set (java.util.Set) 1
ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue) 1
ExecutorService (java.util.concurrent.ExecutorService) 1
Executors (java.util.concurrent.Executors) 1