Use of org.apache.iceberg.FileFormat in project presto by prestodb.
From the class IcebergHadoopMetadata, method beginCreateTable:
@Override
public ConnectorOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional<ConnectorNewTableLayout> layout) {
    SchemaTableName schemaTableName = tableMetadata.getTable();
    String schemaName = schemaTableName.getSchemaName();
    String tableName = schemaTableName.getTableName();
    Schema schema = toIcebergSchema(tableMetadata.getColumns());
    PartitionSpec partitionSpec = parsePartitionFields(schema, getPartitioning(tableMetadata.getProperties()));

    // Translate the Presto table properties (file format, comment, format version) into Iceberg table properties
    ImmutableMap.Builder<String, String> propertiesBuilder = ImmutableMap.builder();
    FileFormat fileFormat = getFileFormat(tableMetadata.getProperties());
    propertiesBuilder.put(DEFAULT_FILE_FORMAT, fileFormat.toString());
    if (tableMetadata.getComment().isPresent()) {
        propertiesBuilder.put(TABLE_COMMENT, tableMetadata.getComment().get());
    }
    String formatVersion = getFormatVersion(tableMetadata.getProperties());
    if (formatVersion != null) {
        propertiesBuilder.put(FORMAT_VERSION, formatVersion);
    }

    // Start a create-table transaction in the catalog; the new table only becomes visible on commit
    try {
        transaction = resourceFactory.getCatalog(session).newCreateTableTransaction(
                toIcebergTableIdentifier(schemaTableName), schema, partitionSpec, propertiesBuilder.build());
    }
    catch (AlreadyExistsException e) {
        throw new TableAlreadyExistsException(schemaTableName);
    }

    Table icebergTable = transaction.table();
    return new IcebergWritableTableHandle(
            schemaName,
            tableName,
            SchemaParser.toJson(icebergTable.schema()),
            PartitionSpecParser.toJson(icebergTable.spec()),
            getColumns(icebergTable.schema(), typeManager),
            icebergTable.location(),
            fileFormat,
            icebergTable.properties());
}
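As a side note, getFileFormat(...) is a Presto-internal helper, but the round trip between the stored property string and the org.apache.iceberg.FileFormat enum can be sketched with the core Iceberg API alone. The following is a minimal, self-contained sketch; FileFormatPropertyDemo is a made-up class name, and it assumes the DEFAULT_FILE_FORMAT key above is Iceberg's TableProperties.DEFAULT_FILE_FORMAT ("write.format.default"):

import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.TableProperties;

public class FileFormatPropertyDemo {
    public static void main(String[] args) {
        // Property map as the handle above would store it: "write.format.default" -> "PARQUET"
        Map<String, String> properties =
                Collections.singletonMap(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.toString());

        // Reading it back: the enum constants are upper case, so normalize before valueOf()
        String name = properties.getOrDefault(
                TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
        FileFormat fileFormat = FileFormat.valueOf(name.toUpperCase(Locale.ENGLISH));

        System.out.println(fileFormat);                                     // PARQUET
        System.out.println(fileFormat.addExtension("data/00000"));          // data/00000.parquet
        System.out.println(FileFormat.fromFileName("data/00000.parquet"));  // PARQUET
    }
}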
Use of org.apache.iceberg.FileFormat in project hive by apache.
From the class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied:
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files is currently only supported " +
      "in non-vectorized mode and only for Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);

  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("customer_id").build();
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);

  // add some more data to the same partition
  shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " +
      "(0, 'Blake', 'Blue')");
  tbl.refresh();

  // delete the first and third rows from the newly-added data file
  DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
      .filter(file -> file.partition().get(0, Long.class) == 0L)
      .filter(file -> file.recordCount() == 3)
      .findAny()
      .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
  List<Record> rowsToDel = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .add(0L, "Laura", "Yellow").add(0L, "Blake", "Blue").build();
  List<PositionDelete<Record>> deletes = ImmutableList.of(
      positionDelete(dataFile.path(), 0L, rowsToDel.get(0)),
      positionDelete(dataFile.path(), 2L, rowsToDel.get(1)));
  DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
      ImmutableMap.of("customer_id", 0L), deletes);
  tbl.newRowDelta().addDeletes(deleteFile).commit();

  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
  Assert.assertEquals(4, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
  Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
  Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
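The positionDelete(...) helper called above belongs to the test's own utilities rather than to the Iceberg public API; it is presumably a thin wrapper around org.apache.iceberg.deletes.PositionDelete, roughly like the sketch below (the signature is assumed from the call sites, and PositionDeleteHelper is a made-up wrapper class):

import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;

final class PositionDeleteHelper {
  private PositionDeleteHelper() {
  }

  // A position delete marks "the row at position pos inside the data file at path" as deleted,
  // optionally carrying a copy of the deleted row (as the test above does via rowsToDel).
  static PositionDelete<Record> positionDelete(CharSequence path, long pos, Record row) {
    PositionDelete<Record> positionDelete = PositionDelete.create();
    return positionDelete.set(path, pos, row);
  }
}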
Use of org.apache.iceberg.FileFormat in project hive by apache.
From the class HiveIcebergStorageHandlerWithEngineBase, method parameters:
@Parameters(name = "fileFormat={0}, engine={1}, catalog={2}, isVectorized={3}")
public static Collection<Object[]> parameters() {
  Collection<Object[]> testParams = new ArrayList<>();
  String javaVersion = System.getProperty("java.specification.version");

  // Run tests with every FileFormat for a single Catalog (HiveCatalog)
  for (FileFormat fileFormat : HiveIcebergStorageHandlerTestUtils.FILE_FORMATS) {
    for (String engine : EXECUTION_ENGINES) {
      // include Tez tests only for Java 8
      if (javaVersion.equals("1.8")) {
        testParams.add(new Object[] { fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, false });
        // test with vectorization=ON for the ORC and PARQUET formats on the Tez engine
        if ((fileFormat == FileFormat.ORC || fileFormat == FileFormat.PARQUET) && "tez".equals(engine) &&
            MetastoreUtil.hive3PresentOnClasspath()) {
          testParams.add(new Object[] { fileFormat, engine, TestTables.TestTableType.HIVE_CATALOG, true });
        }
      }
    }
  }

  // Run the remaining catalogs with a single FileFormat (PARQUET) and the Tez engine;
  // HiveCatalog is skipped here because it is already covered above
  for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
    if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
      testParams.add(new Object[] { FileFormat.PARQUET, "tez", testTableType, false });
    }
  }
  return testParams;
}
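For readers less familiar with JUnit 4's Parameterized runner: each Object[] returned by parameters() becomes one test instance, and the {0}..{3} placeholders in the @Parameters name refer to its elements in order. The class below is an illustrative, stripped-down stand-in (not the real Hive test class) showing how such rows are bound through the constructor:

import java.util.Arrays;
import java.util.Collection;
import org.apache.iceberg.FileFormat;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

// The Parameterized runner creates one instance per Object[] row and passes the elements
// to the constructor in order.
@RunWith(Parameterized.class)
public class FileFormatParameterDemo {

  private final FileFormat fileFormat;
  private final String engine;

  public FileFormatParameterDemo(FileFormat fileFormat, String engine) {
    this.fileFormat = fileFormat;
    this.engine = engine;
  }

  @Parameters(name = "fileFormat={0}, engine={1}")
  public static Collection<Object[]> parameters() {
    return Arrays.asList(new Object[][] {
        { FileFormat.PARQUET, "tez" },
        { FileFormat.ORC, "tez" },
        { FileFormat.AVRO, "tez" }
    });
  }

  @Test
  public void runsOncePerParameterRow() {
    // With the @Parameters name pattern, this shows up as e.g. "fileFormat=PARQUET, engine=tez"
    System.out.println(fileFormat + " / " + engine);
  }
}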
Use of org.apache.iceberg.FileFormat in project hive by apache.
From the class TestHiveIcebergPartitions, method testPartitionPruning:
@Test
public void testPartitionPruning() throws IOException {
  Schema salesSchema = new Schema(
      required(1, "ss_item_sk", Types.IntegerType.get()),
      required(2, "ss_sold_date_sk", Types.IntegerType.get()));
  PartitionSpec salesSpec = PartitionSpec.builderFor(salesSchema).identity("ss_sold_date_sk").build();
  Schema dimSchema = new Schema(
      required(1, "d_date_sk", Types.IntegerType.get()),
      required(2, "d_moy", Types.IntegerType.get()));

  List<Record> salesRecords = TestHelper.RecordsBuilder.newInstance(salesSchema)
      .add(51, 5).add(61, 6).add(71, 7).add(81, 8).add(91, 9).build();
  List<Record> dimRecords = TestHelper.RecordsBuilder.newInstance(dimSchema)
      .add(1, 10).add(2, 20).add(3, 30).add(4, 40).add(5, 50).build();
  Table salesTable = testTables.createTable(shell, "x1_store_sales", salesSchema, salesSpec, fileFormat, null);
  PartitionKey partitionKey = new PartitionKey(salesSpec, salesSchema);
  for (Record r : salesRecords) {
    partitionKey.partition(r);
    testTables.appendIcebergTable(shell.getHiveConf(), salesTable, fileFormat, partitionKey, ImmutableList.of(r));
  }
  testTables.createTable(shell, "x1_date_dim", dimSchema, fileFormat, dimRecords);

  String query = "select s.ss_item_sk from x1_store_sales s, x1_date_dim d " +
      "where s.ss_sold_date_sk=d.d_date_sk*2 and d.d_moy=30";

  // Check the query results
  List<Object[]> rows = shell.executeStatement(query);
  Assert.assertEquals(1, rows.size());
  Assert.assertArrayEquals(new Object[] { 61 }, rows.get(0));

  // Check if Dynamic Partitioning is used
  Assert.assertTrue(shell.executeStatement("explain " + query).stream()
      .filter(a -> ((String) a[0]).contains("Dynamic Partitioning Event Operator"))
      .findAny()
      .isPresent());
}
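The manual write loop above relies on org.apache.iceberg.PartitionKey to route each record into its identity partition before appending. Below is a standalone sketch of that mechanic using only the core Iceberg data API; PartitionKeyDemo is a made-up name, and partitionToPath() is shown only to make the resulting partition visible:

import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.types.Types;

public class PartitionKeyDemo {
  public static void main(String[] args) {
    Schema salesSchema = new Schema(
        required(1, "ss_item_sk", Types.IntegerType.get()),
        required(2, "ss_sold_date_sk", Types.IntegerType.get()));
    PartitionSpec salesSpec = PartitionSpec.builderFor(salesSchema).identity("ss_sold_date_sk").build();

    // Build one record and derive its partition key, as the write loop in the test does per record
    GenericRecord record = GenericRecord.create(salesSchema);
    record.setField("ss_item_sk", 61);
    record.setField("ss_sold_date_sk", 6);

    PartitionKey partitionKey = new PartitionKey(salesSpec, salesSchema);
    partitionKey.partition(record);

    // Renders the partition directory name for this key
    System.out.println(salesSpec.partitionToPath(partitionKey)); // ss_sold_date_sk=6
  }
}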
Use of org.apache.iceberg.FileFormat in project hive by apache.
From the class HiveIcebergOutputFormat, method writer:
private static HiveIcebergRecordWriter writer(JobConf jc) {
  TaskAttemptID taskAttemptID = TezUtil.taskAttemptWrapper(jc);
  // It gets the config from the FileSinkOperator which has its own config for every target table
  Table table = HiveIcebergStorageHandler.table(jc, jc.get(hive_metastoreConstants.META_TABLE_NAME));
  Schema schema = HiveIcebergStorageHandler.schema(jc);
  PartitionSpec spec = table.spec();
  FileFormat fileFormat = FileFormat.valueOf(PropertyUtil.propertyAsString(table.properties(),
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT)
      .toUpperCase(Locale.ENGLISH));
  long targetFileSize = PropertyUtil.propertyAsLong(table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  FileIO io = table.io();
  int partitionId = taskAttemptID.getTaskID().getId();
  int taskId = taskAttemptID.getId();
  String operationId = jc.get(HiveConf.ConfVars.HIVEQUERYID.varname) + "-" + taskAttemptID.getJobID();
  OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, taskId)
      .format(fileFormat)
      .operationId(operationId)
      .build();
  String tableName = jc.get(Catalogs.NAME);
  HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat,
      null, null, null, null);
  return new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io, targetFileSize,
      taskAttemptID, tableName);
}
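For context on the defaults involved: with no write.* overrides set on the table, the two PropertyUtil lookups above resolve to Parquet and a 512 MB target file size. Below is a minimal sketch of just that resolution; WriterDefaultsDemo is a made-up name, and it uses only org.apache.iceberg.TableProperties and org.apache.iceberg.util.PropertyUtil:

import java.util.Collections;
import java.util.Locale;
import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.util.PropertyUtil;

public class WriterDefaultsDemo {
  public static void main(String[] args) {
    // A table with no explicit write.* properties set
    Map<String, String> tableProperties = Collections.emptyMap();

    // Same lookups as in writer(): file format and target file size, falling back to the Iceberg defaults
    FileFormat fileFormat = FileFormat.valueOf(
        PropertyUtil.propertyAsString(tableProperties,
            TableProperties.DEFAULT_FILE_FORMAT,
            TableProperties.DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
    long targetFileSize = PropertyUtil.propertyAsLong(tableProperties,
        TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
        TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);

    System.out.println(fileFormat);                      // PARQUET
    System.out.println(targetFileSize / (1024 * 1024));  // 512
  }
}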