Example 6 with DataFile

Use of org.apache.iceberg.DataFile in project hive by apache.

Class HiveIcebergOutputCommitter, method commitTable:

/**
 * Collects the additions to a single table and adds/commits the new files to the Iceberg table.
 * @param io The FileIO used to read the forCommit files
 * @param executor The executor used to read the forCommit files
 * @param jobContext The job context
 * @param name The name of the table used for loading from the catalog
 * @param location The location of the table used for loading from the catalog
 * @param catalogName The name of the catalog that contains the table
 */
private void commitTable(FileIO io, ExecutorService executor, JobContext jobContext, String name, String location, String catalogName) {
    JobConf conf = jobContext.getJobConf();
    Properties catalogProperties = new Properties();
    catalogProperties.put(Catalogs.NAME, name);
    catalogProperties.put(Catalogs.LOCATION, location);
    if (catalogName != null) {
        catalogProperties.put(InputFormatConfig.CATALOG_NAME, catalogName);
    }
    Table table = Catalogs.loadTable(conf, catalogProperties);
    long startTime = System.currentTimeMillis();
    LOG.info("Committing job has started for table: {}, using location: {}", table, generateJobLocation(location, conf, jobContext.getJobID()));
    int numTasks = SessionStateUtil.getCommitInfo(conf, name).map(info -> info.getTaskNum()).orElseGet(() -> {
        // Fallback logic if the number of tasks is not available in the config.
        // If there are reducers, then every reducer will generate a result file.
        // If this is a map only task, then every mapper will generate a result file.
        LOG.info("Number of tasks not available in session state for jobID: {}, table: {}. Falling back to jobConf " + "numReduceTasks/numMapTasks", jobContext.getJobID(), name);
        return conf.getNumReduceTasks() > 0 ? conf.getNumReduceTasks() : conf.getNumMapTasks();
    });
    Collection<DataFile> dataFiles = dataFiles(numTasks, executor, location, jobContext, io, true);
    boolean isOverwrite = conf.getBoolean(InputFormatConfig.IS_OVERWRITE, false);
    if (isOverwrite) {
        if (!dataFiles.isEmpty()) {
            ReplacePartitions overwrite = table.newReplacePartitions();
            dataFiles.forEach(overwrite::addFile);
            overwrite.commit();
            LOG.info("Overwrite commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        } else if (table.spec().isUnpartitioned()) {
            // TODO: we won't get here if we have a formerly-partitioned table, whose partition specs have been turned void
            table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
            LOG.info("Cleared table contents as part of empty overwrite for unpartitioned table. " + "Commit took {} ms for table: {}", System.currentTimeMillis() - startTime, table);
        }
        LOG.debug("Overwrote partitions with files {}", dataFiles);
    } else if (!dataFiles.isEmpty()) {
        // Appending data files to the table
        // We only create a new commit if there's something to append
        AppendFiles append = table.newAppend();
        dataFiles.forEach(append::appendFile);
        append.commit();
        LOG.info("Append commit took {} ms for table: {} with {} file(s)", System.currentTimeMillis() - startTime, table, dataFiles.size());
        LOG.debug("Added files {}", dataFiles);
    } else {
        LOG.info("Not creating a new commit for table: {}, jobID: {}, since there were no new files to append", table, jobContext.getJobID());
    }
}
Also used: NotFoundException(org.apache.iceberg.exceptions.NotFoundException) Arrays(java.util.Arrays) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) FileSystem(org.apache.hadoop.fs.FileSystem) Catalogs(org.apache.iceberg.mr.Catalogs) ObjectInputStream(java.io.ObjectInputStream) LoggerFactory(org.slf4j.LoggerFactory) AppendFiles(org.apache.iceberg.AppendFiles) OutputFile(org.apache.iceberg.io.OutputFile) FileStatus(org.apache.hadoop.fs.FileStatus) TaskType(org.apache.hadoop.mapreduce.TaskType) OutputCommitter(org.apache.hadoop.mapred.OutputCommitter) TaskAttemptContext(org.apache.hadoop.mapred.TaskAttemptContext) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) ObjectOutputStream(java.io.ObjectOutputStream) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) JobID(org.apache.hadoop.mapreduce.JobID) DataFile(org.apache.iceberg.DataFile) ExecutorService(java.util.concurrent.ExecutorService) Properties(java.util.Properties) Logger(org.slf4j.Logger) Table(org.apache.iceberg.Table) Collection(java.util.Collection) HiveConf(org.apache.hadoop.hive.conf.HiveConf) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) ThreadFactoryBuilder(org.apache.iceberg.relocated.com.google.common.util.concurrent.ThreadFactoryBuilder) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) JobConf(org.apache.hadoop.mapred.JobConf) Util(org.apache.iceberg.hadoop.Util) JobContext(org.apache.hadoop.mapred.JobContext) ReplacePartitions(org.apache.iceberg.ReplacePartitions) Tasks(org.apache.iceberg.util.Tasks) Optional(java.util.Optional) SessionStateUtil(org.apache.hadoop.hive.ql.session.SessionStateUtil) Expressions(org.apache.iceberg.expressions.Expressions) FileIO(org.apache.iceberg.io.FileIO) VisibleForTesting(org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue)
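
Stripped of the Hive-specific plumbing, the non-overwrite branch of commitTable boils down to a short Iceberg API sequence: load the table through the MR Catalogs helper, then append all collected files in one atomic snapshot. A minimal sketch, assuming a prepared Configuration and a list of already-written DataFile objects (the table name and variable names here are illustrative, not from the original source):

import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.Catalogs;

public class AppendCommitSketch {

    // Loads the target table and commits all data files in a single snapshot,
    // mirroring the append branch of commitTable above.
    static void appendFiles(Configuration conf, List<DataFile> dataFiles) {
        Properties props = new Properties();
        // Hypothetical table name; commitTable receives name/location as arguments.
        props.put(Catalogs.NAME, "default.target_table");
        Table table = Catalogs.loadTable(conf, props);
        if (!dataFiles.isEmpty()) {
            AppendFiles append = table.newAppend();
            dataFiles.forEach(append::appendFile);
            // One commit produces one new snapshot; readers see all files or none.
            append.commit();
        }
    }
}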

Example 7 with DataFile

Use of org.apache.iceberg.DataFile in project hive by apache.

Class HiveTableTest, method testDropTable:

@Test
public void testDropTable() throws IOException {
    Table table = catalog.loadTable(TABLE_IDENTIFIER);
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
    List<GenericData.Record> records = Lists.newArrayList(recordBuilder.set("id", 1L).build(), recordBuilder.set("id", 2L).build(), recordBuilder.set("id", 3L).build());
    String location1 = table.location().replace("file:", "") + "/data/file1.avro";
    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(location1)).schema(schema).named("test").build()) {
        for (GenericData.Record rec : records) {
            writer.add(rec);
        }
    }
    String location2 = table.location().replace("file:", "") + "/data/file2.avro";
    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(location2)).schema(schema).named("test").build()) {
        for (GenericData.Record rec : records) {
            writer.add(rec);
        }
    }
    DataFile file1 = DataFiles.builder(table.spec())
        .withRecordCount(3)
        .withPath(location1)
        .withFileSizeInBytes(Files.localInput(location1).getLength())
        .build();
    DataFile file2 = DataFiles.builder(table.spec())
        .withRecordCount(3)
        .withPath(location2)
        .withFileSizeInBytes(Files.localInput(location2).getLength())
        .build();
    // add both data files
    table.newAppend().appendFile(file1).appendFile(file2).commit();
    // delete file2
    table.newDelete().deleteFile(file2.path()).commit();
    String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    Assert.assertTrue("Drop (table and data) should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER));
    Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));
    Assert.assertFalse("Table data files should not exist", new File(location1).exists());
    Assert.assertFalse("Table data files should not exist", new File(location2).exists());
    Assert.assertFalse("Table manifest list files should not exist", new File(manifestListLocation).exists());
    for (ManifestFile manifest : manifests) {
        Assert.assertFalse("Table manifest files should not exist", new File(manifest.path().replace("file:", "")).exists());
    }
    Assert.assertFalse("Table metadata file should not exist", new File(((HasTableOperations) table).operations().current().metadataFileLocation().replace("file:", "")).exists());
}
Also used: Table(org.apache.iceberg.Table) PosixFilePermissions.fromString(java.nio.file.attribute.PosixFilePermissions.fromString) GenericData(org.apache.hive.iceberg.org.apache.avro.generic.GenericData) ManifestFile(org.apache.iceberg.ManifestFile) DataFile(org.apache.iceberg.DataFile) GenericRecordBuilder(org.apache.hive.iceberg.org.apache.avro.generic.GenericRecordBuilder) HasTableOperations(org.apache.iceberg.HasTableOperations) File(java.io.File) Test(org.junit.Test)
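
A detail worth noting in this test: the single-argument dropTable used above purges data by default. In the Iceberg Catalog interface, dropTable(identifier) delegates to dropTable(identifier, true), which is why the file-deletion assertions hold. A minimal sketch of the two entry points, reusing catalog and TABLE_IDENTIFIER from the test fixture:

// Purging drop: removes the table and deletes its data and metadata files.
// Catalog.dropTable(identifier) is equivalent to dropTable(identifier, true).
catalog.dropTable(TABLE_IDENTIFIER);
// Non-purging drop: removes the table from the catalog but leaves the files
// in place (this is what Example 8 below verifies).
catalog.dropTable(TABLE_IDENTIFIER, false);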

Example 8 with DataFile

Use of org.apache.iceberg.DataFile in project hive by apache.

Class HiveTableTest, method testDropWithoutPurgeLeavesTableData:

@Test
public void testDropWithoutPurgeLeavesTableData() throws IOException {
    Table table = catalog.loadTable(TABLE_IDENTIFIER);
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(schema, "test"));
    List<GenericData.Record> records = Lists.newArrayList(recordBuilder.set("id", 1L).build(), recordBuilder.set("id", 2L).build(), recordBuilder.set("id", 3L).build());
    String fileLocation = table.location().replace("file:", "") + "/data/file.avro";
    try (FileAppender<GenericData.Record> writer = Avro.write(Files.localOutput(fileLocation)).schema(schema).named("test").build()) {
        for (GenericData.Record rec : records) {
            writer.add(rec);
        }
    }
    DataFile file = DataFiles.builder(table.spec()).withRecordCount(3).withPath(fileLocation).withFileSizeInBytes(Files.localInput(fileLocation).getLength()).build();
    table.newAppend().appendFile(file).commit();
    String manifestListLocation = table.currentSnapshot().manifestListLocation().replace("file:", "");
    Assert.assertTrue("Drop should return true and drop the table", catalog.dropTable(TABLE_IDENTIFIER, false));
    Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));
    Assert.assertTrue("Table data files should exist", new File(fileLocation).exists());
    Assert.assertTrue("Table metadata files should exist", new File(manifestListLocation).exists());
}
Also used: DataFile(org.apache.iceberg.DataFile) Table(org.apache.iceberg.Table) GenericRecordBuilder(org.apache.hive.iceberg.org.apache.avro.generic.GenericRecordBuilder) PosixFilePermissions.fromString(java.nio.file.attribute.PosixFilePermissions.fromString) GenericData(org.apache.hive.iceberg.org.apache.avro.generic.GenericData) ManifestFile(org.apache.iceberg.ManifestFile) File(java.io.File) Test(org.junit.Test)
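
Both tests construct DataFile metadata by hand with DataFiles.builder. At minimum this takes a path, a file size, and a record count; for a partitioned spec a partition value must be supplied as well. A minimal sketch, where table and fileSizeInBytes are assumed from the surrounding test and the partition path string is illustrative:

DataFile file = DataFiles.builder(table.spec())
    .withPath("/data/file.avro")             // file format is inferred from the extension
    .withFileSizeInBytes(fileSizeInBytes)
    .withRecordCount(3)
    // .withPartitionPath("customer_id=0")   // needed only when the spec is partitioned
    .build();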

Example 9 with DataFile

Use of org.apache.iceberg.DataFile in project hive by apache.

Class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied:

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files is currently supported only in " + "non-vectorized mode and only for Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // delete the first and third rows from the newly-added data file
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
        .filter(file -> file.partition().get(0, Long.class) == 0L)
        .filter(file -> file.recordCount() == 3)
        .findAny()
        .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(0L, "Laura", "Yellow")
        .add(0L, "Blake", "Blue")
        .build();
    List<PositionDelete<Record>> deletes = ImmutableList.of(
        positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
        positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
        ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used: DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete)
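
The positionDelete(...) helper used in this test wraps Iceberg's PositionDelete holder, which targets a row by (data file path, row ordinal) and can optionally carry the deleted row itself. A minimal sketch of constructing one directly, with dataFile and rowsToDel as in the test:

import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;

// Marks row 0 of the given data file as deleted; the third argument is the
// (optional) content of the deleted row, which this test supplies explicitly.
PositionDelete<Record> delete = PositionDelete.create();
delete.set(dataFile.path(), 0L, rowsToDel.get(0));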

Example 10 with DataFile

Use of org.apache.iceberg.DataFile in project hive by apache.

Class TestIcebergInputFormats, method testFilterExp:

@Test
public void testFilterExp() throws Exception {
    helper.createTable();
    List<Record> expectedRecords = helper.generateRandomRecords(2, 0L);
    expectedRecords.get(0).set(2, "2020-03-20");
    expectedRecords.get(1).set(2, "2020-03-20");
    DataFile dataFile1 = helper.writeFile(Row.of("2020-03-20", 0), expectedRecords);
    DataFile dataFile2 = helper.writeFile(Row.of("2020-03-21", 0), helper.generateRandomRecords(2, 0L));
    helper.appendToTable(dataFile1, dataFile2);
    builder.filter(Expressions.equal("date", "2020-03-20"));
    testInputFormat.create(builder.conf()).validate(expectedRecords);
}
Also used: DataFile(org.apache.iceberg.DataFile) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
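
The filter pushed into the builder is constructed with Iceberg's Expressions factory; predicates compose with and/or/not before the input format uses them to prune files and filter rows. A minimal sketch, where the extra "id" bound is illustrative rather than part of the test:

import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

// Composite predicate: the partition value filtered in the test, plus an
// additional illustrative lower bound on an "id" column.
Expression expr = Expressions.and(
    Expressions.equal("date", "2020-03-20"),
    Expressions.greaterThanOrEqual("id", 0L));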

Aggregations

DataFile (org.apache.iceberg.DataFile): 25 usages
Table (org.apache.iceberg.Table): 14 usages
Test (org.junit.Test): 12 usages
IOException (java.io.IOException): 7 usages
ExecutorService (java.util.concurrent.ExecutorService): 6 usages
AppendFiles (org.apache.iceberg.AppendFiles): 5 usages
Record (org.apache.iceberg.data.Record): 5 usages
List (java.util.List): 4 usages
Map (java.util.Map): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap): 4 usages
Types (org.apache.iceberg.types.Types): 4 usages
File (java.io.File): 3 usages
ArrayList (java.util.ArrayList): 3 usages
Optional (java.util.Optional): 3 usages
Set (java.util.Set): 3 usages
Collectors (java.util.stream.Collectors): 3 usages
JobConf (org.apache.hadoop.mapred.JobConf): 3 usages
DeleteFile (org.apache.iceberg.DeleteFile): 3 usages
Transaction (org.apache.iceberg.Transaction): 3 usages