Use of org.apache.iceberg.ContentFile in project iceberg by apache.
The class TestRewriteDataFilesAction, method testBinPackWithDeletes.
@Test
public void testBinPackWithDeletes() throws Exception {
  Table table = createTablePartitioned(4, 2);
  table.updateProperties().set(TableProperties.FORMAT_VERSION, "2").commit();
  shouldHaveFiles(table, 8);
  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();

  RowDelta rowDelta = table.newRowDelta();
  // add 1 delete file for data files 0, 1, 2
  for (int i = 0; i < 3; i++) {
    writePosDeletesToFile(table, dataFiles.get(i), 1).forEach(rowDelta::addDeletes);
  }
  // add 2 delete files for data files 3, 4
  for (int i = 3; i < 5; i++) {
    writePosDeletesToFile(table, dataFiles.get(i), 2).forEach(rowDelta::addDeletes);
  }
  rowDelta.commit();
  table.refresh();

  List<Object[]> expectedRecords = currentData();
  Result result = actions().rewriteDataFiles(table)
      .option(BinPackStrategy.MIN_FILE_SIZE_BYTES, "0")
      .option(RewriteDataFiles.TARGET_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE - 1))
      .option(BinPackStrategy.MAX_FILE_SIZE_BYTES, Long.toString(Long.MAX_VALUE))
      .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "2")
      .execute();
  Assert.assertEquals("Action should rewrite 2 data files", 2, result.rewrittenDataFilesCount());

  List<Object[]> actualRecords = currentData();
  assertEquals("Rows must match", expectedRecords, actualRecords);
  Assert.assertEquals("7 rows are removed", total - 7, actualRecords.size());
}
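The writePosDeletesToFile helper called above is defined elsewhere in the test class and is not shown here. Below is a minimal, hypothetical sketch of how such a helper could produce position-delete files with Iceberg's generic writers; the class name, output locations, Parquet format, and one-delete-per-file layout are assumptions for illustration, not the project's actual implementation.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.GenericAppenderFactory;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptedFiles;
import org.apache.iceberg.io.OutputFile;

public class PosDeleteSketch {
  // Hypothetical sketch: writes `outputDeleteFiles` position-delete files against `dataFile`,
  // the i-th file deleting row position i of that data file.
  public static List<DeleteFile> writePosDeletes(Table table, DataFile dataFile, int outputDeleteFiles)
      throws IOException {
    List<DeleteFile> results = new ArrayList<>();
    GenericAppenderFactory appenderFactory = new GenericAppenderFactory(table.schema(), table.spec());
    for (int i = 0; i < outputDeleteFiles; i++) {
      // illustrative output location; a real helper would follow the table's file naming scheme
      OutputFile out = table.io().newOutputFile(
          table.locationProvider().newDataLocation(UUID.randomUUID() + "-pos-deletes.parquet"));
      PositionDeleteWriter<Record> writer = appenderFactory.newPosDeleteWriter(
          EncryptedFiles.plainAsEncryptedOutput(out), FileFormat.PARQUET, dataFile.partition());
      try (PositionDeleteWriter<Record> closing = writer) {
        closing.delete(dataFile.path(), i); // mark the i-th row of the data file as deleted
      }
      results.add(writer.toDeleteFile());
    }
    return results;
  }
}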
Use of org.apache.iceberg.ContentFile in project hive by apache.
The class HiveIcebergOutputCommitter, method abortJob.
/**
* Removes the generated data files if there is a commit file already generated for them.
* The cleanup at the end removes the temporary directories as well.
* @param originalContext The job context
* @param status The status of the job
* @throws IOException if there is a failure deleting the files
*/
@Override
public void abortJob(JobContext originalContext, int status) throws IOException {
  JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
  JobConf jobConf = jobContext.getJobConf();
  LOG.info("Job {} is aborted. Data file cleaning started", jobContext.getJobID());
  Collection<String> outputs = HiveIcebergStorageHandler.outputTables(jobContext.getJobConf());
  Collection<String> jobLocations = new ConcurrentLinkedQueue<>();
  ExecutorService fileExecutor = fileExecutor(jobConf);
  ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
  try {
    // Clean up the changes for the output tables in parallel
    Tasks.foreach(outputs)
        .suppressFailureWhenFinished()
        .executeWith(tableExecutor)
        .onFailure((output, exc) -> LOG.warn("Failed cleanup table {} on abort job", output, exc))
        .run(output -> {
          LOG.info("Cleaning job for jobID: {}, table: {}", jobContext.getJobID(), output);
          Table table = HiveIcebergStorageHandler.table(jobConf, output);
          String jobLocation = generateJobLocation(table.location(), jobConf, jobContext.getJobID());
          jobLocations.add(jobLocation);
          // List jobLocation to get the number of forCommit files; we do this because the
          // map/reduce task count in jobConf is unreliable and we have no access to vertex status info
          int numTasks = listForCommits(jobConf, jobLocation).size();
          FilesForCommit results = collectResults(numTasks, fileExecutor, table.location(), jobContext, table.io(), false);
          // Check if we have files already written and remove data and delete files if there are any
          Collection<ContentFile> files = Stream.concat(results.dataFiles().stream(), results.deleteFiles().stream())
              .collect(Collectors.toList());
          if (files.size() > 0) {
            Tasks.foreach(files)
                .retry(3)
                .suppressFailureWhenFinished()
                .executeWith(fileExecutor)
                .onFailure((file, exc) -> LOG.warn("Failed to remove data file {} on abort job", file.path(), exc))
                .run(file -> table.io().deleteFile(file.path().toString()));
          }
        }, IOException.class);
  } finally {
    fileExecutor.shutdown();
    if (tableExecutor != null) {
      tableExecutor.shutdown();
    }
  }
  LOG.info("Job {} is aborted. Data file cleaning finished", jobContext.getJobID());
  cleanup(jobContext, jobLocations);
}
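The cleanup above leans on Iceberg's Tasks utility for parallel, retried, failure-tolerant execution. The following standalone sketch isolates that pattern for deleting a list of file paths through a FileIO; the class name, pool size, and inputs are hypothetical.

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.util.Tasks;

public class AbortCleanupSketch {
  // Delete a set of file paths in parallel, retrying each delete and suppressing failures
  // so that one bad file does not stop the rest of the cleanup.
  public static void deleteAll(FileIO io, List<String> pathsToDelete) {
    ExecutorService pool = Executors.newFixedThreadPool(4);
    try {
      Tasks.foreach(pathsToDelete)
          .retry(3)                          // retry each delete up to 3 times
          .suppressFailureWhenFinished()     // keep going even if some deletes fail
          .executeWith(pool)                 // run deletes on the worker pool
          .onFailure((path, exc) -> System.err.printf("Failed to delete %s: %s%n", path, exc))
          .run(io::deleteFile);              // FileIO.deleteFile(String)
    } finally {
      pool.shutdown();
    }
  }
}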
Use of org.apache.iceberg.ContentFile in project iceberg by apache.
The class TestRewriteDataFilesAction, method testRewriteAvoidRepeateCompress.
/**
 * A test case verifying that files are not compressed repeatedly.
 * <p>
 * If a data file cannot be combined into a CombinedScanTask with other data files, the resulting
 * CombinedScanTask contains only that single file, so we remove such CombinedScanTasks to avoid
 * compressing the file repeatedly.
 * <p>
 * In this test case we generate 3 data files and set targetSizeInBytes greater than the largest file size,
 * so that file cannot be combined into a CombinedScanTask with the other data files. The data file with the
 * largest file size will not be compressed.
 *
 * @throws IOException IOException
 */
@Test
public void testRewriteAvoidRepeateCompress() throws IOException {
  Assume.assumeFalse("ORC does not support getting length when file is opening", format.equals(FileFormat.ORC));
  List<Record> expected = Lists.newArrayList();
  Schema schema = icebergTableUnPartitioned.schema();
  GenericAppenderFactory genericAppenderFactory = new GenericAppenderFactory(schema);
  File file = temp.newFile();
  int count = 0;
  try (FileAppender<Record> fileAppender = genericAppenderFactory.newAppender(Files.localOutput(file), format)) {
    long filesize = 20000;
    for (; fileAppender.length() < filesize; count++) {
      Record record = SimpleDataUtil.createRecord(count, "iceberg");
      fileAppender.add(record);
      expected.add(record);
    }
  }

  DataFile dataFile = DataFiles.builder(icebergTableUnPartitioned.spec())
      .withPath(file.getAbsolutePath())
      .withFileSizeInBytes(file.length())
      .withFormat(format)
      .withRecordCount(count)
      .build();
  icebergTableUnPartitioned.newAppend().appendFile(dataFile).commit();

  sql("INSERT INTO %s SELECT 1,'a' ", TABLE_NAME_UNPARTITIONED);
  sql("INSERT INTO %s SELECT 2,'b' ", TABLE_NAME_UNPARTITIONED);
  icebergTableUnPartitioned.refresh();

  CloseableIterable<FileScanTask> tasks = icebergTableUnPartitioned.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 3 data files before rewrite", 3, dataFiles.size());

  Actions actions = Actions.forTable(icebergTableUnPartitioned);
  long targetSizeInBytes = file.length() + 10;
  RewriteDataFilesActionResult result = actions.rewriteDataFiles()
      .targetSizeInBytes(targetSizeInBytes)
      .splitOpenFileCost(1)
      .execute();
  Assert.assertEquals("Action should rewrite 2 data files", 2, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());

  icebergTableUnPartitioned.refresh();
  CloseableIterable<FileScanTask> tasks1 = icebergTableUnPartitioned.newScan().planFiles();
  List<DataFile> dataFilesRewrote = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
  Assert.assertEquals("Should have 2 data files after rewrite", 2, dataFilesRewrote.size());

  // the biggest file should not be rewritten
  List<CharSequence> rewroteDataFileNames = dataFilesRewrote.stream().map(ContentFile::path).collect(Collectors.toList());
  Assert.assertTrue(rewroteDataFileNames.contains(file.getAbsolutePath()));

  // Assert the table records are as expected.
  expected.add(SimpleDataUtil.createRecord(1, "a"));
  expected.add(SimpleDataUtil.createRecord(2, "b"));
  SimpleDataUtil.assertTableRecords(icebergTableUnPartitioned, expected);
}
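As the Javadoc explains, a data file larger than targetSizeInBytes ends up alone in its CombinedScanTask and is therefore skipped by the rewrite. A hedged sketch of inspecting split planning with the read.split.target-size and read.split.open-file-cost scan options follows; the class name and parameter values are illustrative assumptions.

import java.io.IOException;
import org.apache.iceberg.CombinedScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.io.CloseableIterable;

public class SplitPlanningSketch {
  // Plan combined scan tasks with a given split target size and print how many files each task holds.
  // A single-file task corresponds to a file that cannot be bin-packed with others at this target size.
  public static void printTaskSizes(Table table, long targetSizeInBytes) throws IOException {
    try (CloseableIterable<CombinedScanTask> tasks = table.newScan()
        .option(TableProperties.SPLIT_SIZE, String.valueOf(targetSizeInBytes)) // read.split.target-size
        .option(TableProperties.SPLIT_OPEN_FILE_COST, "1")                     // read.split.open-file-cost
        .planTasks()) {
      for (CombinedScanTask task : tasks) {
        System.out.printf("task with %d file(s)%n", task.files().size());
      }
    }
  }
}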
Use of org.apache.iceberg.ContentFile in project iceberg by apache.
The class TestRewriteDataFilesAction, method testBinPackWithDeleteAllData.
@Test
public void testBinPackWithDeleteAllData() {
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.FORMAT_VERSION, "2");
  Table table = createTablePartitioned(1, 1, 1, options);
  shouldHaveFiles(table, 1);
  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  int total = (int) dataFiles.stream().mapToLong(ContentFile::recordCount).sum();

  RowDelta rowDelta = table.newRowDelta();
  // remove all data
  writePosDeletesToFile(table, dataFiles.get(0), total).forEach(rowDelta::addDeletes);
  rowDelta.commit();
  table.refresh();

  List<Object[]> expectedRecords = currentData();
  Result result = actions().rewriteDataFiles(table)
      .option(BinPackStrategy.DELETE_FILE_THRESHOLD, "1")
      .execute();
  Assert.assertEquals("Action should rewrite 1 data files", 1, result.rewrittenDataFilesCount());

  List<Object[]> actualRecords = currentData();
  assertEquals("Rows must match", expectedRecords, actualRecords);
  Assert.assertEquals("Data manifest should not have existing data file",
      0, (long) table.currentSnapshot().dataManifests().get(0).existingFilesCount());
  Assert.assertEquals("Data manifest should have 1 delete data file",
      1L, (long) table.currentSnapshot().dataManifests().get(0).deletedFilesCount());
  Assert.assertEquals("Delete manifest added row count should equal total count",
      total, (long) table.currentSnapshot().deleteManifests().get(0).addedRowsCount());
}
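Both bin-pack tests drive file selection through BinPackStrategy.DELETE_FILE_THRESHOLD, which marks a data file for rewrite once at least that many delete files apply to it. The sketch below counts such candidates from a scan plan; the class and method names are hypothetical.

import java.io.IOException;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.io.CloseableIterable;

public class DeleteThresholdSketch {
  // Count the data files that have at least `deleteFileThreshold` delete files attached,
  // i.e. the files a delete-threshold-based bin-pack rewrite would consider.
  public static long countFilesOverThreshold(Table table, int deleteFileThreshold) throws IOException {
    long candidates = 0;
    try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
      for (FileScanTask task : tasks) {
        // task.deletes() lists the delete files that apply to this data file
        if (task.deletes().size() >= deleteFileThreshold) {
          candidates++;
        }
      }
    }
    return candidates;
  }
}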