Use of org.apache.iceberg.DataFile in project hive by apache.
Class HiveIcebergOutputCommitter, method commitTable.
/**
 * Collects the additions to a single table and adds/commits the new files to the Iceberg table.
 * @param io The FileIO used to read the forCommit files
 * @param executor The executor used to read the forCommit files
 * @param jobContext The job context
 * @param name The name of the table used for loading from the catalog
 * @param location The location of the table used for loading from the catalog
 * @param catalogName The name of the catalog that contains the table
 */
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location,
    String catalogName) {
  JobConf conf = jobContext.getJobConf();
  Properties catalogProperties = new Properties();
  catalogProperties.put(Catalogs.NAME, name);
  catalogProperties.put(Catalogs.LOCATION, location);
  if (catalogName != null) {
    catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName);
  }
  Table table = Catalogs.loadTable(conf, catalogProperties);

  long startTime = System.currentTimeMillis();
  LOG.info("Committing job has started for table: {}, using location: {}", table,
      generateJobLocation(location, conf, jobContext.getJobID()));

  int numTasks = SessionStateUtil.getCommitInfo(conf, name).map(info -> info.getTaskNum()).orElseGet(() -> {
    // Fallback logic, if number of tasks are not available in the config
    // If there are reducers, then every reducer will generate a result file.
    // If this is a map only task, then every mapper will generate a result file.
    LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " +
        "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
    return conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();
  });

  Collection<DataFile> dataFiles = dataFiles(numTasks, executor, location, jobContext, io, true);

  boolean isOverwrite = conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false);
  if (isOverwrite) {
    if (!dataFiles.isEmpty()) {
      ReplacePartitions overwrite = table.newReplacePartitions();
      dataFiles.forEach(overwrite::addFile);
      overwrite.commit();
      LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime,
          table, dataFiles.size());
    } else if (table.spec().isUnpartitioned()) {
      // TODO: we won't get here if we have a formerly-partitioned table, whose partition specs have been turned void
      table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
      LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " +
          "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
    }
    LOG.debug("Overwrote partitions with files {}", dataFiles);
  } else if (dataFiles.size() > 0) {
    // Appending data files to the table
    // We only create a new commit if there's something to append
    AppendFiles append = table.newAppend();
    dataFiles.forEach(append::appendFile);
    append.commit();
    LOG.info("Append commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime,
        table, dataFiles.size());
    LOG.debug("Added files {}", dataFiles);
  } else {
    LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append", table,
        jobContext.getJobID());
  }
}
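Stripped of the MapReduce plumbing, the method above comes down to two Iceberg commit paths: ReplacePartitions for an insert-overwrite, AppendFiles for a plain insert, and a delete-all for an empty overwrite of an unpartitioned table. The following is a minimal sketch of that decision only, assuming a loaded Table and an already-collected list of DataFile objects; commitDataFiles is an illustrative helper name, not part of the committer.

import java.util.List;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.ReplacePartitions;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;

// Sketch only: commit a batch of DataFiles either as a partition overwrite or as an append.
static void commitDataFiles(Table table, List<DataFile> dataFiles, boolean isOverwrite) {
  if (isOverwrite) {
    if (!dataFiles.isEmpty()) {
      // Atomically replace the partitions touched by the new files
      ReplacePartitions overwrite = table.newReplacePartitions();
      dataFiles.forEach(overwrite::addFile);
      overwrite.commit();
    } else if (table.spec().isUnpartitioned()) {
      // Empty overwrite of an unpartitioned table: truncate by deleting every row
      table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
    }
  } else if (!dataFiles.isEmpty()) {
    // Plain insert: append the new files without touching existing data
    AppendFiles append = table.newAppend();
    dataFiles.forEach(append::appendFile);
    append.commit();
  }
}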
Use of org.apache.iceberg.DataFile in project hive by apache.
Class HiveTableTest, method testDropTable.
@Test
public void testDropTable() throws IOException {
  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
  List<GenericData.Record> records = Lists.newArrayList(
      recordBuilder.set("id", 1L).build(),
      recordBuilder.set("id", 2L).build(),
      recordBuilder.set("id", 3L).build());

  String location1 = table.location().replace("file:", "") + "/data/file1.avro";
  try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(location1))
      .schema(schema).named("test").build()) {
    for (GenericData.Record rec : records) {
      writer.add(rec);
    }
  }

  String location2 = table.location().replace("file:", "") + "/data/file2.avro";
  try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(location2))
      .schema(schema).named("test").build()) {
    for (GenericData.Record rec : records) {
      writer.add(rec);
    }
  }

  DataFile file1 = DataFiles.builder(table.spec())
      .withRecordCount(3)
      .withPath(location1)
      .withFileSizeInBytes(Files.localInput(location2).getLength())
      .build();
  DataFile file2 = DataFiles.builder(table.spec())
      .withRecordCount(3)
      .withPath(location2)
      .withFileSizeInBytes(Files.localInput(location1).getLength())
      .build();

  // add both data files
  table.newAppend().appendFile(file1).appendFile(file2).commit();
  // delete file2
  table.newDelete().deleteFile(file2.path()).commit();

  String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");
  List<ManifestFile> manifests = table.currentSnapshot().allManifests();

  Assert.assertTrue("Drop (table and data) should return true and drop the table",
      catalog.dropTable(TABLE_IDENTIFIER));
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));
  Assert.assertFalse("Table data files should not exist", new File(location1).exists());
  Assert.assertFalse("Table data files should not exist", new File(location2).exists());
  Assert.assertFalse("Table manifest list files should not exist", new File(manifestListLocation).exists());
  for (ManifestFile manifest : manifests) {
    Assert.assertFalse("Table manifest files should not exist",
        new File(manifest.path().replace("file:", "")).exists());
  }
  Assert.assertFalse("Table metadata file should not exist",
      new File(((HasTableOperations) table).operations().current().metadataFileLocation().replace("file:", "")).exists());
}
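The key step in this test is turning already-written Avro files into Iceberg DataFile entries with DataFiles.builder, since Iceberg only tracks file metadata rather than copying data. Below is a minimal sketch of that registration pattern; registerExistingFile is an illustrative helper name, and the size and record count are assumed to be known by the caller (for a partitioned spec, a partition value would also have to be supplied, e.g. via withPartitionPath).

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;

// Sketch only: register an existing data file in the table's metadata and commit it.
static DataFile registerExistingFile(Table table, String path, long sizeInBytes, long recordCount) {
  DataFile dataFile = DataFiles.builder(table.spec())
      .withPath(path)                    // physical location of the already-written file
      .withFormat(FileFormat.AVRO)       // matches the Avro writer used in the test above
      .withFileSizeInBytes(sizeInBytes)  // length in bytes, e.g. from Files.localInput(path).getLength()
      .withRecordCount(recordCount)      // row count recorded in the manifest entry
      .build();
  table.newAppend().appendFile(dataFile).commit();
  return dataFile;
}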
Use of org.apache.iceberg.DataFile in project hive by apache.
Class HiveTableTest, method testDropWithoutPurgeLeavesTableData.
@Test
public void testDropWithoutPurgeLeavesTableData() throws IOException {
  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
  List<GenericData.Record> records = Lists.newArrayList(
      recordBuilder.set("id", 1L).build(),
      recordBuilder.set("id", 2L).build(),
      recordBuilder.set("id", 3L).build());

  String fileLocation = table.location().replace("file:", "") + "/data/file.avro";
  try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(fileLocation))
      .schema(schema).named("test").build()) {
    for (GenericData.Record rec : records) {
      writer.add(rec);
    }
  }

  DataFile file = DataFiles.builder(table.spec())
      .withRecordCount(3)
      .withPath(fileLocation)
      .withFileSizeInBytes(Files.localInput(fileLocation).getLength())
      .build();
  table.newAppend().appendFile(file).commit();

  String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");

  Assert.assertTrue("Drop should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER, false));
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));
  Assert.assertTrue("Table data files should exist", new File(fileLocation).exists());
  Assert.assertTrue("Table metadata files should exist", new File(manifestListLocation).exists());
}
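The contrast with the previous test is the purge flag on the catalog call: dropTable(identifier) purges data and metadata files by default, while dropTable(identifier, false) only removes the table from the catalog and leaves the files on disk. A small sketch of that distinction, assuming an existing Iceberg Catalog and TableIdentifier; dropTable here is an illustrative wrapper, not part of the tests.

import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.catalog.TableIdentifier;

// Sketch only: drop a table, optionally leaving its data and metadata files in place.
static boolean dropTable(Catalog catalog, TableIdentifier identifier, boolean purge) {
  // purge == true (the behavior of dropTable(identifier)) removes data, manifests and metadata;
  // purge == false only deregisters the table, as testDropWithoutPurgeLeavesTableData expects.
  return catalog.dropTable(identifier, purge);
}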
Use of org.apache.iceberg.DataFile in project hive by apache.
Class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied.
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
      "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);

  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("customer_id").build();
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec,
      fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);

  // add some more data to the same partition
  shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " +
      "(0, 'Blake', 'Blue')");
  tbl.refresh();

  // delete the first and third rows from the newly-added data file
  DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
      .filter(file -> file.partition().get(0, Long.class) == 0L)
      .filter(file -> file.recordCount() == 3)
      .findAny()
      .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
  List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(0L, "Laura", "Yellow").add(0L, "Blake", "Blue").build();
  List<PositionDelete<Record>> deletes = ImmutableList.of(
      positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
      positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
  DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
      ImmutableMap.of("customer_id", 0L), deletes);
  tbl.newRowDelta().addDeletes(deleteFile).commit();

  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
  Assert.assertEquals(4, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
  Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
  Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
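The calls positionDelete(...) and HiveIcebergTestUtils.createPositionalDeleteFile(...) are test helpers, but the underlying Iceberg pieces are PositionDelete records, which pair a data file path with a row position (and optionally the deleted row), and a RowDelta commit that attaches the resulting DeleteFile to the table. A minimal sketch of those two pieces, assuming the delete file itself has already been written by some writer.

import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;

// Sketch only: a position delete targeting row `position` of the data file at `dataFilePath`;
// supplying the deleted row content is optional, but the test above does so.
static PositionDelete<Record> positionDelete(CharSequence dataFilePath, long position, Record deletedRow) {
  PositionDelete<Record> delete = PositionDelete.create();
  return delete.set(dataFilePath, position, deletedRow);
}

// Sketch only: once the deletes are written into a DeleteFile, a row delta commit attaches them.
static void commitDeletes(Table table, DeleteFile deleteFile) {
  table.newRowDelta().addDeletes(deleteFile).commit();
}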
Use of org.apache.iceberg.DataFile in project hive by apache.
Class TestIcebergInputFormats, method testFilterExp.
@Test
public void testFilterExp() throws Exception {
  helper.createTable();

  List<Record> expectedRecords = helper.generateRandomRecords(2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");

  DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), expectedRecords);
  DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L));
  helper.appendToTable(dataFile1, dataFile2);

  builder.filter(Expressions.equal("date", "2020-03-20"));
  testInputFormat.create(builder.conf()).validate(expectedRecords);
}
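The validation passes because the filter Expressions.equal("date", "2020-03-20") prunes the "2020-03-21" partition that dataFile2 was written into, so only the records from dataFile1 are read back. The same kind of filter can be applied directly to a core table scan; the sketch below assumes a table partitioned by a date column and uses an illustrative helper name.

import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.expressions.Expressions;

// Sketch only: the same partition filter expressed through the core scan API.
static TableScan scanSingleDay(Table table) {
  // Data files whose partition values cannot match the predicate are pruned at planning time.
  return table.newScan().filter(Expressions.equal("date", "2020-03-20"));
}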