Search in sources:

Example 1 with FileFormat

Use of org.apache.iceberg.FileFormat in project presto by prestodb.

From the class IcebergHadoopMetadata, method beginCreateTable:

@Override
public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional<ConnectorNewTableLayout> layout) {
    SchemaTableName schemaTableName = tableMetadata.getTable();
    String schemaName = schemaTableName.getSchemaName();
    String tableName = schemaTableName.getTableName();
    Schema schema = toIcebergSchema(tableMetadata.getColumns());
    PartitionSpec partitionSpec = parsePartitionFields(schema, getPartitioning(tableMetadata.getProperties()));
    ImmutableMap.Builder<String, String> propertiesBuilder = ImmutableMap.builder();
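    // Record the requested write file format (and, if present, the table comment and format version) in the new table's properties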
    FileFormat fileFormat = getFileFormat(tableMetadata.getProperties());
    propertiesBuilder.put(DEFAULT_FILE_FORMAT, fileFormat.toString());
    if (tableMetadata.getComment().isPresent()) {
        propertiesBuilder.put(TABLE_COMMENT, tableMetadata.getComment().get());
    }
    String formatVersion = getFormatVersion(tableMetadata.getProperties());
    if (formatVersion != null) {
        propertiesBuilder.put(FORMAT_VERSION, formatVersion);
    }
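    // Stage the table creation in a catalog transaction; it is committed later, after the writes complete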
    try {
        transaction = resourceFactory.getCatalog(session).newCreateTableTransaction(
                toIcebergTableIdentifier(schemaTableName), schema, partitionSpec, propertiesBuilder.build());
    } catch (AlreadyExistsException e) {
        throw new TableAlreadyExistsException(schemaTableName);
    }
    Table icebergTable = transaction.table();
    return new IcebergWritableTableHandle(
            schemaName, tableName,
            SchemaParser.toJson(icebergTable.schema()), PartitionSpecParser.toJson(icebergTable.spec()),
            getColumns(icebergTable.schema(), typeManager),
            icebergTable.location(), fileFormat, icebergTable.properties());
}
Also used: TableAlreadyExistsException (com.facebook.presto.hive.TableAlreadyExistsException), SystemTable (com.facebook.presto.spi.SystemTable), IcebergUtil.getHadoopIcebergTable (com.facebook.presto.iceberg.IcebergUtil.getHadoopIcebergTable), Table (org.apache.iceberg.Table), AlreadyExistsException (org.apache.iceberg.exceptions.AlreadyExistsException), Schema (org.apache.iceberg.Schema), FileFormat (org.apache.iceberg.FileFormat), IcebergTableProperties.getFileFormat (com.facebook.presto.iceberg.IcebergTableProperties.getFileFormat), SchemaTableName (com.facebook.presto.spi.SchemaTableName), PartitionSpec (org.apache.iceberg.PartitionSpec), ImmutableMap (com.google.common.collect.ImmutableMap), ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)
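
The essential step above is writing the chosen FileFormat into the new table's properties so that later writers can recover it. The sketch below shows that round trip using only Iceberg's public FileFormat and TableProperties APIs; it assumes the DEFAULT_FILE_FORMAT constant used above corresponds to Iceberg's TableProperties.DEFAULT_FILE_FORMAT ("write.format.default"), and the class name FileFormatPropertySketch is invented for illustration.

// Sketch only: store the chosen format in the table properties, then resolve it again.
// Assumes DEFAULT_FILE_FORMAT above refers to Iceberg's TableProperties.DEFAULT_FILE_FORMAT.
import java.util.Locale;
import java.util.Map;

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.TableProperties;

public class FileFormatPropertySketch {
    public static void main(String[] args) {
        // Write side: record the format the same way beginCreateTable does, as the enum's string form.
        Map<String, String> tableProperties =
                Map.of(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.toString());

        // Read side: fall back to Iceberg's default and normalize case before valueOf.
        String name = tableProperties.getOrDefault(
                TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
        FileFormat format = FileFormat.valueOf(name.toUpperCase(Locale.ENGLISH));

        System.out.println(format); // PARQUET
    }
}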

Example 2 with FileFormat

Use of org.apache.iceberg.FileFormat in project hive by apache.

From the class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied:

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
            "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
            .identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec,
            fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // delete the first and third rows from the newly-added data file
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
            .filter(file -> file.partition().get(0, Long.class) == 0L).filter(file -> file.recordCount() == 3)
            .findAny().orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
            .add(0L, "Laura", "Yellow").add(0L, "Blake", "Blue").build();
    List<PositionDelete<Record>> deletes = ImmutableList.of(positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
            positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
            ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used: DataFile (org.apache.iceberg.DataFile), Types (org.apache.iceberg.types.Types), ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap), Table (org.apache.iceberg.Table), NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional), IOException (java.io.IOException), Test (org.junit.Test), TestHelper (org.apache.iceberg.mr.TestHelper), ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList), Schema (org.apache.iceberg.Schema), FileFormat (org.apache.iceberg.FileFormat), List (java.util.List), Record (org.apache.iceberg.data.Record), PartitionSpec (org.apache.iceberg.PartitionSpec), StreamSupport (java.util.stream.StreamSupport), DeleteFile (org.apache.iceberg.DeleteFile), Assume (org.junit.Assume), Assert (org.junit.Assert), PositionDelete (org.apache.iceberg.deletes.PositionDelete)
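
For readers unfamiliar with positional deletes, the sketch below shows what the positionDelete(...) helper above most likely constructs: a PositionDelete<Record> that identifies a row by data file path and row position, optionally carrying the deleted row's content. PositionDelete, GenericRecord and Types are standard Iceberg APIs; the schema, values and file path are made up for illustration, and no delete file is actually written here.

// Sketch only: describe a positional delete in memory; the test above then writes such
// deletes into a DeleteFile and commits them with newRowDelta().
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;
import org.apache.iceberg.types.Types;

public class PositionDeleteSketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "customer_id", Types.LongType.get()),
                Types.NestedField.required(2, "first_name", Types.StringType.get()));

        // The row content the delete refers to (optional for position deletes, but supplied in the test above).
        Record deletedRow = GenericRecord.create(schema);
        deletedRow.setField("customer_id", 0L);
        deletedRow.setField("first_name", "Laura");

        // A position delete identifies a row by (data file path, row position within that file).
        PositionDelete<Record> delete = PositionDelete.<Record>create()
                .set("/warehouse/customers/data/00000-0-data.parquet", 0L, deletedRow);

        System.out.println(delete.path() + " @ " + delete.pos());
    }
}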

Example 3 with FileFormat

Use of org.apache.iceberg.FileFormat in project hive by apache.

From the class HiveIcebergStorageHandlerWithEngineBase, method parameters:

@Parameters(name = "fileFormat={0}, engine={1}, catalog={2}, isVectorized={3}")
public static Collection<Object[]> parameters() {
    Collection<Object[]> testParams = new ArrayList<>();
    String javaVersion = System.getProperty("java.specification.version");
    // Run tests with every FileFormat for a single Catalog (HiveCatalog)
    for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) {
        for (String engine : EXECUTION_ENGINES) {
            // include Tez tests only for Java 8
            if (javaVersion.equals("1.8")) {
                testParams.add(new Object[] { fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, false });
                // test for vectorization=ON in case of ORC and PARQUET format with Tez engine
                if ((fileFormat == FileFormat.ORC || fileFormat == FileFormat.PARQUET) && "tez".equals(engine) && MetastoreUtil.hive3PresentOnClasspath()) {
                    testParams.add(new Object[] { fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, true });
                }
            }
        }
    }
    // Skip HiveCatalog here, since those combinations were already added above
    for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
        if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
            testParams.add(new Object[] { FileFormat.PARQUET, "tez", testTableType, false });
        }
    }
    return testParams;
}
Also used: ArrayList (java.util.ArrayList), FileFormat (org.apache.iceberg.FileFormat), Parameters (org.junit.runners.Parameterized.Parameters)
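
The pattern above, building a JUnit parameter matrix over FileFormat values, can be reduced to the sketch below with no Hive test infrastructure. The format set, engine names and vectorization rule here are illustrative assumptions rather than the project's actual matrix, and the class name FileFormatParameterSketch is invented.

// Sketch only: enumerate FileFormat values into a parameter matrix, roughly mirroring parameters() above.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;

import org.apache.iceberg.FileFormat;

public class FileFormatParameterSketch {
    public static Collection<Object[]> parameters() {
        Collection<Object[]> params = new ArrayList<>();
        // METADATA is also a FileFormat value but not a data file format, so it is skipped here.
        for (FileFormat fileFormat : EnumSet.of(FileFormat.ORC, FileFormat.PARQUET, FileFormat.AVRO)) {
            for (String engine : new String[] { "mr", "tez" }) {
                // Illustrative rule: vectorize only ORC/PARQUET on the tez engine, as in the example above.
                boolean vectorized = fileFormat != FileFormat.AVRO && "tez".equals(engine);
                params.add(new Object[] { fileFormat, engine, vectorized });
            }
        }
        return params;
    }

    public static void main(String[] args) {
        parameters().forEach(p -> System.out.println(Arrays.toString(p)));
    }
}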

Example 4 with FileFormat

Use of org.apache.iceberg.FileFormat in project hive by apache.

From the class TestHiveIcebergPartitions, method testPartitionPruning:

@Test
public void testPartitionPruning() throws IOException {
    Schema salesSchema = new Schema(required(1, "ss_item_sk", Types.IntegerType.get()),
            required(2, "ss_sold_date_sk", Types.IntegerType.get()));
    PartitionSpec salesSpec = PartitionSpec.builderFor(salesSchema).identity("ss_sold_date_sk").build();
    Schema dimSchema = new Schema(required(1, "d_date_sk", Types.IntegerType.get()),
            required(2, "d_moy", Types.IntegerType.get()));
    List<Record> salesRecords = TestHelper.RecordsBuilder.newInstance(salesSchema)
            .add(51, 5).add(61, 6).add(71, 7).add(81, 8).add(91, 9).build();
    List<Record> dimRecords = TestHelper.RecordsBuilder.newInstance(salesSchema)
            .add(1, 10).add(2, 20).add(3, 30).add(4, 40).add(5, 50).build();
    Table salesTable = testTables.createTable(shell, "x1_store_sales", salesSchema, salesSpec, fileFormat, null);
    PartitionKey partitionKey = new PartitionKey(salesSpec, salesSchema);
    for (Record r : salesRecords) {
        partitionKey.partition(r);
        testTables.appendIcebergTable(shell.getHiveConf(), salesTable, fileFormat, partitionKey, ImmutableList.of(r));
    }
    testTables.createTable(shell, "x1_date_dim", dimSchema, fileFormat, dimRecords);
    String query = "select s.ss_item_sk from x1_store_sales s, x1_date_dim d " + "where s.ss_sold_date_sk=d.d_date_sk*2 and d.d_moy=30";
    // Check the query results
    List<Object[]> rows = shell.executeStatement(query);
    Assert.assertEquals(1, rows.size());
    Assert.assertArrayEquals(new Object[] { 61 }, rows.get(0));
    // Check if Dynamic Partitioning is used
    Assert.assertTrue(shell.executeStatement("explain " + query).stream()
            .filter(a -> ((String) a[0]).contains("Dynamic Partitioning Event Operator"))
            .findAny().isPresent());
}
Also used: Types (org.apache.iceberg.types.Types), Table (org.apache.iceberg.Table), LocalDateTime (java.time.LocalDateTime), NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional), IOException (java.io.IOException), Test (org.junit.Test), TestHelper (org.apache.iceberg.mr.TestHelper), ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList), Schema (org.apache.iceberg.Schema), FileFormat (org.apache.iceberg.FileFormat), List (java.util.List), Record (org.apache.iceberg.data.Record), OffsetDateTime (java.time.OffsetDateTime), NestedField.required (org.apache.iceberg.types.Types.NestedField.required), LocalDate (java.time.LocalDate), PartitionSpec (org.apache.iceberg.PartitionSpec), PartitionKey (org.apache.iceberg.PartitionKey), Assume (org.junit.Assume), ZoneOffset (java.time.ZoneOffset), Assert (org.junit.Assert)
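
The PartitionKey mechanics used in the append loop above can be seen in isolation in the sketch below: an identity partition spec is built and PartitionKey.partition extracts the partition value from each record. PartitionSpec, PartitionKey and GenericRecord are used as in the test; the schema and values echo the example, while the class name PartitionKeySketch and the printed partition path are for illustration only.

// Sketch only: compute the partition a record belongs to before appending it to that partition.
import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.types.Types;

public class PartitionKeySketch {
    public static void main(String[] args) {
        Schema schema = new Schema(
                Types.NestedField.required(1, "ss_item_sk", Types.IntegerType.get()),
                Types.NestedField.required(2, "ss_sold_date_sk", Types.IntegerType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("ss_sold_date_sk").build();

        Record record = GenericRecord.create(schema);
        record.setField("ss_item_sk", 61);
        record.setField("ss_sold_date_sk", 6);

        // Fill the key from the record; appended data files are then registered under this partition.
        PartitionKey partitionKey = new PartitionKey(spec, schema);
        partitionKey.partition(record);
        System.out.println(spec.partitionToPath(partitionKey)); // ss_sold_date_sk=6
    }
}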

Example 5 with FileFormat

Use of org.apache.iceberg.FileFormat in project hive by apache.

From the class HiveIcebergOutputFormat, method writer:

private static HiveIcebergRecordWriter writer(JobConf jc) {
    TaskAttemptID taskAttemptID = TezUtil.taskAttemptWrapper(jc);
    // It gets the config from the FileSinkOperator which has its own config for every target table
    Table table = HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME));
    Schema schema = HiveIcebergStorageHandler.schema(jc);
    PartitionSpec spec = table.spec();
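    // Resolve the write file format and target data file size from the table properties, falling back to Iceberg's defaults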
    FileFormat fileFormat = FileFormat.valueOf(PropertyUtil.propertyAsString(table.properties(),
            TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT).toUpperCase(Locale.ENGLISH));
    long targetFileSize = PropertyUtil.propertyAsLong(table.properties(),
            TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
    FileIO io = table.io();
    int partitionId = taskAttemptID.getTaskID().getId();
    int taskId = taskAttemptID.getId();
    String operationId = jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID();
    OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
            .format(fileFormat).operationId(operationId).build();
    String tableName = jc.get(Catalogs.NAME);
    HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat, null, null, null, null);
    return new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io, targetFileSize, taskAttemptID, tableName);
}
Also used: OutputFileFactory (org.apache.iceberg.io.OutputFileFactory), Table (org.apache.iceberg.Table), TaskAttemptID (org.apache.hadoop.mapred.TaskAttemptID), Schema (org.apache.iceberg.Schema), FileFormat (org.apache.iceberg.FileFormat), PartitionSpec (org.apache.iceberg.PartitionSpec), FileIO (org.apache.iceberg.io.FileIO)
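
The two property lookups in writer() can be reproduced standalone, as in the sketch below: PropertyUtil falls back to Iceberg's defaults when a table does not set write.format.default or write.target-file-size-bytes, and the resolved FileFormat also determines the data file extension. A plain map stands in for table.properties(); the class name WriterConfigSketch and the sample values are assumptions for illustration.

// Sketch only: resolve writer configuration from table properties the way writer() does.
import java.util.Locale;
import java.util.Map;

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.util.PropertyUtil;

public class WriterConfigSketch {
    public static void main(String[] args) {
        // Only the file format is set; the target file size will fall back to Iceberg's default.
        Map<String, String> props = Map.of(TableProperties.DEFAULT_FILE_FORMAT, "orc");

        FileFormat fileFormat = FileFormat.valueOf(PropertyUtil.propertyAsString(props,
                TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT)
                .toUpperCase(Locale.ENGLISH));
        long targetFileSize = PropertyUtil.propertyAsLong(props,
                TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);

        System.out.println(fileFormat);                                  // ORC
        System.out.println(targetFileSize);                              // 536870912 (512 MB) by default
        System.out.println(fileFormat.addExtension("00000-0-attempt"));  // 00000-0-attempt.orc
    }
}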

Aggregations

FileFormat (org.apache.iceberg.FileFormat): 11
PartitionSpec (org.apache.iceberg.PartitionSpec): 9
Schema (org.apache.iceberg.Schema): 9
Table (org.apache.iceberg.Table): 7
IOException (java.io.IOException): 5
List (java.util.List): 5
Record (org.apache.iceberg.data.Record): 5
ArrayList (java.util.ArrayList): 4
Types (org.apache.iceberg.types.Types): 4
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 4
Assert (org.junit.Assert): 4
Path (org.apache.hadoop.fs.Path): 3
DeleteFile (org.apache.iceberg.DeleteFile): 3
PositionDelete (org.apache.iceberg.deletes.PositionDelete): 3
HdfsContext (com.facebook.presto.hive.HdfsContext): 2
TableAlreadyExistsException (com.facebook.presto.hive.TableAlreadyExistsException): 2
PrestoException (com.facebook.presto.spi.PrestoException): 2
Map (java.util.Map): 2
JobConf (org.apache.hadoop.mapred.JobConf): 2
TaskAttemptID (org.apache.hadoop.mapred.TaskAttemptID): 2