Example 6 with FileFormat

use of org.apache.iceberg.FileFormat in project hive by apache.

the class HiveVectorizedReader method reader.

public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant, TaskAttemptContext context) {
    // Tweaks on jobConf here are relevant for this task only, so we need to copy it first as the context's conf is reused.
    JobConf job = new JobConf((JobConf) context.getConfiguration());
    Path path = new Path(inputFile.location());
    FileFormat format = task.file().format();
    Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
    // Hive by default requires partition columns to be read too. This is not required for identity partition
    // columns, as we will add these as constants later.
    int[] partitionColIndices = null;
    Object[] partitionValues = null;
    PartitionSpec partitionSpec = task.spec();
    List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
    if (!partitionSpec.isUnpartitioned()) {
        List<PartitionField> fields = partitionSpec.fields();
        List<Integer> partitionColIndicesList = Lists.newLinkedList();
        List<Object> partitionValuesList = Lists.newLinkedList();
        for (PartitionField partitionField : fields) {
            if (partitionField.transform().isIdentity()) {
                // Get columns in read schema order (which matches those of readColumnIds) to find partition column indices
                List<Types.NestedField> columns = task.spec().schema().columns();
                for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
                    if (columns.get(colIdx).fieldId() == partitionField.sourceId()) {
                        // Skip reading identity partition columns from source file...
                        readColumnIds.remove((Integer) colIdx);
                        // ...and use the corresponding constant value instead
                        partitionColIndicesList.add(colIdx);
                        partitionValuesList.add(idToConstant.get(partitionField.sourceId()));
                        break;
                    }
                }
            }
        }
        partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
        partitionValues = partitionValuesList.toArray(new Object[0]);
        ColumnProjectionUtils.setReadColumns(job, readColumnIds);
    }
    try {
        long start = task.start();
        long length = task.length();
        // TODO: Iceberg currently does not track the last modification time of a file. Until that's added,
        // we need to set Long.MIN_VALUE as last modification time in the fileId triplet.
        SyntheticFileId fileId = new SyntheticFileId(path, task.file().fileSizeInBytes(), Long.MIN_VALUE);
        RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
        switch(format) {
            case ORC:
                recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
                break;
            case PARQUET:
                recordReader = parquetRecordReader(job, reporter, task, path, start, length);
                break;
            default:
                throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
        }
        return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
    } catch (IOException ioe) {
        throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
    }
}
Also used : SyntheticFileId(org.apache.hadoop.hive.ql.io.SyntheticFileId) FileFormat(org.apache.iceberg.FileFormat) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) PartitionField(org.apache.iceberg.PartitionField) JobConf(org.apache.hadoop.mapred.JobConf) Path(org.apache.hadoop.fs.Path) Reporter(org.apache.hadoop.mapred.Reporter) IOException(java.io.IOException) PartitionSpec(org.apache.iceberg.PartitionSpec) NullWritable(org.apache.hadoop.io.NullWritable)
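
As a quick illustration of the FileFormat dispatch used above, here is a minimal sketch that mirrors the switch in reader() without the Hadoop/Hive plumbing. The class and method names (FormatDispatchSketch, readerKindFor) and the sample file name are hypothetical; only the FileFormat calls are actual Iceberg API.

import org.apache.iceberg.FileFormat;

public class FormatDispatchSketch {

    // Hypothetical helper mirroring the switch in HiveVectorizedReader.reader():
    // ORC and Parquet get vectorized readers, everything else is rejected.
    static String readerKindFor(FileFormat format) {
        switch (format) {
            case ORC:
                return "vectorized ORC reader";
            case PARQUET:
                return "vectorized Parquet reader";
            default:
                throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
        }
    }

    public static void main(String[] args) {
        // FileFormat.fromFileName() infers the format from the file extension.
        FileFormat format = FileFormat.fromFileName("00000-0-data.parquet");
        System.out.println(readerKindFor(format));
    }
}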

Example 7 with FileFormat

use of org.apache.iceberg.FileFormat in project hive by apache.

the class HiveIcebergTestUtils method createEqualityDeleteFile.

/**
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param equalityFields List of field names that should play a role in the equality check
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param rowsToDelete The rows that should be deleted. It is enough to fill out the fields that are relevant for the
 *                     equality check, as listed in equalityFields; the rest of the fields are ignored
 * @return The DeleteFile created
 * @throws IOException If there is an error during DeleteFile write
 */
public static DeleteFile createEqualityDeleteFile(Table table, String deleteFilePath, List<String> equalityFields, FileFormat fileFormat, List<Record> rowsToDelete) throws IOException {
    List<Integer> equalityFieldIds = equalityFields.stream().map(id -> table.schema().findField(id).fieldId()).collect(Collectors.toList());
    Schema eqDeleteRowSchema = table.schema().select(equalityFields.toArray(new String[] {}));
    FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null);
    EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
    PartitionKey part = new PartitionKey(table.spec(), eqDeleteRowSchema);
    part.partition(rowsToDelete.get(0));
    EqualityDeleteWriter<Record> eqWriter = appenderFactory.newEqDeleteWriter(outputFile, fileFormat, part);
    try (EqualityDeleteWriter<Record> writer = eqWriter) {
        writer.deleteAll(rowsToDelete);
    }
    return eqWriter.toDeleteFile();
}
Also used : Arrays(java.util.Arrays) HadoopOutputFile(org.apache.iceberg.hadoop.HadoopOutputFile) Types(org.apache.iceberg.types.Types) Text(org.apache.hadoop.io.Text) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) JobID(org.apache.hadoop.mapred.JobID) LongWritable(org.apache.hadoop.io.LongWritable) ByteBuffer(java.nio.ByteBuffer) BigDecimal(java.math.BigDecimal) TimestampUtils(org.apache.hadoop.hive.common.type.TimestampUtils) ArrayUtil(org.apache.iceberg.util.ArrayUtil) ByteBuffers(org.apache.iceberg.util.ByteBuffers) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) GenericRecord(org.apache.iceberg.data.GenericRecord) PositionDeleteWriter(org.apache.iceberg.deletes.PositionDeleteWriter) LocalTime(java.time.LocalTime) PartitionKey(org.apache.iceberg.PartitionKey) ZoneOffset(java.time.ZoneOffset) Path(java.nio.file.Path) IntWritable(org.apache.hadoop.io.IntWritable) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Timestamp(java.sql.Timestamp) UUID(java.util.UUID) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) List(java.util.List) OffsetDateTime(java.time.OffsetDateTime) BooleanWritable(org.apache.hadoop.io.BooleanWritable) EncryptedOutputFile(org.apache.iceberg.encryption.EncryptedOutputFile) LocalDate(java.time.LocalDate) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory) PositionDelete(org.apache.iceberg.deletes.PositionDelete) LocalDateTime(java.time.LocalDateTime) IcebergGenerics(org.apache.iceberg.data.IcebergGenerics) DoubleWritable(org.apache.hadoop.io.DoubleWritable) ArrayList(java.util.ArrayList) BytesWritable(org.apache.hadoop.io.BytesWritable) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) Files(java.nio.file.Files) Table(org.apache.iceberg.Table) EqualityDeleteWriter(org.apache.iceberg.deletes.EqualityDeleteWriter) IOException(java.io.IOException) FileFormat(org.apache.iceberg.FileFormat) File(java.io.File) Record(org.apache.iceberg.data.Record) ObjectInspectorFactory(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory) Paths(java.nio.file.Paths) TimestampTZUtil(org.apache.hadoop.hive.common.type.TimestampTZUtil) PrimitiveObjectInspectorFactory(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) FileAppenderFactory(org.apache.iceberg.io.FileAppenderFactory) DeleteFile(org.apache.iceberg.DeleteFile) Comparator(java.util.Comparator) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) Assert(org.junit.Assert) FloatWritable(org.apache.hadoop.io.FloatWritable)
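
A minimal usage sketch for the helper above (not taken from the Hive test suite): it assumes a format v2 Iceberg Table handle named tbl whose schema has a long customer_id column, that the sketch lives in the same package as HiveIcebergTestUtils, and that the relative delete-file path is an arbitrary example.

import java.io.IOException;
import java.util.Collections;
import org.apache.iceberg.DeleteFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.data.Record;

class EqualityDeleteUsageSketch {

    static void deleteCustomerZero(Table tbl) throws IOException {
        // Only the equality field needs to be populated on the delete row;
        // the remaining columns are ignored by createEqualityDeleteFile().
        Record toDelete = GenericRecord.create(tbl.schema());
        toDelete.setField("customer_id", 0L);

        DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(
            tbl,
            // path relative to the table location root (arbitrary example)
            "data/eq-delete-customer-0",
            // fields that drive the equality check
            Collections.singletonList("customer_id"),
            FileFormat.PARQUET,
            Collections.singletonList(toDelete));

        // Attach the delete file to the (format v2) table, as the V2 tests do.
        tbl.newRowDelta().addDeletes(deleteFile).commit();
    }
}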

Example 8 with FileFormat

use of org.apache.iceberg.FileFormat in project hive by apache.

the class TestHiveIcebergV2 method testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // delete the first and third rows from the newly-added data file - without supplying the row object
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false).filter(file -> file.partition().get(0, Long.class) == 0L).filter(file -> file.recordCount() == 3).findAny().orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<PositionDelete<Record>> deletes = ImmutableList.of(positionDelete(dataFile.path(), 0L, null), positionDelete(dataFile.path(), 2L, null));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat, ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used : DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete)
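
The positionDelete(path, pos, row) helper used in the test is not shown on this page; a plausible implementation, built only on the public PositionDelete API, looks like the sketch below (this is a guess at the helper's shape, not the actual Hive test code).

import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.PositionDelete;

class PositionDeleteSketch {

    // Likely shape of the helper: wrap a data file path, a row position and an
    // optional row copy into a PositionDelete. Passing null for the row matches
    // the "RowNotSupplied" variant of the test above.
    static PositionDelete<Record> positionDelete(CharSequence path, long pos, Record row) {
        PositionDelete<Record> delete = PositionDelete.create();
        return delete.set(path, pos, row);
    }
}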

Example 9 with FileFormat

use of org.apache.iceberg.FileFormat in project hive by apache.

the class TestHiveIcebergOutputCommitter method writeRecords.

/**
 * Write random records to the given table, using a separate {@link HiveIcebergRecordWriter} for every task and
 * committing or aborting each task through the given {@link HiveIcebergOutputCommitter}.
 * @param name The name of the table to get the table object from the conf
 * @param taskNum The number of tasks in the job handled by the committer
 * @param attemptNum The id used for attempt number generation
 * @param commitTasks If <code>true</code> the tasks will be committed
 * @param abortTasks If <code>true</code> the tasks will be aborted - needed so we can simulate a no-commit/no-abort
 *                   situation
 * @param conf The job configuration
 * @param committer The output committer that should be used for committing/aborting the tasks
 * @return The randomly generated records which were appended to the table
 * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions
 */
private List<Record> writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks, JobConf conf, OutputCommitter committer) throws IOException {
    List<Record> expected = new ArrayList<>(RECORD_NUM * taskNum);
    Table table = HiveIcebergStorageHandler.table(conf, name);
    FileIO io = table.io();
    Schema schema = HiveIcebergStorageHandler.schema(conf);
    PartitionSpec spec = table.spec();
    for (int i = 0; i < taskNum; ++i) {
        List<Record> records = TestHelper.generateRandomRecords(schema, RECORD_NUM, i + attemptNum);
        TaskAttemptID taskId = new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum);
        int partitionId = taskId.getTaskID().getId();
        String operationId = QUERY_ID + "-" + JOB_ID;
        FileFormat fileFormat = FileFormat.PARQUET;
        OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, attemptNum).format(fileFormat).operationId(operationId).build();
        HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat, null, null, null, null);
        HiveIcebergRecordWriter testWriter = new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io, TARGET_FILE_SIZE, TezUtil.taskAttemptWrapper(taskId), conf.get(Catalogs.NAME));
        Container<Record> container = new Container<>();
        for (Record record : records) {
            container.set(record);
            testWriter.write(container);
        }
        testWriter.close(false);
        if (commitTasks) {
            committer.commitTask(new TaskAttemptContextImpl(conf, taskId));
            expected.addAll(records);
        } else if (abortTasks) {
            committer.abortTask(new TaskAttemptContextImpl(conf, taskId));
        }
    }
    return expected;
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) Table(org.apache.iceberg.Table) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) Schema(org.apache.iceberg.Schema) ArrayList(java.util.ArrayList) FileFormat(org.apache.iceberg.FileFormat) PartitionSpec(org.apache.iceberg.PartitionSpec) FileIO(org.apache.iceberg.io.FileIO) Container(org.apache.iceberg.mr.mapred.Container) TaskAttemptContextImpl(org.apache.hadoop.mapred.TaskAttemptContextImpl) Record(org.apache.iceberg.data.Record)
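
To make the per-task writer setup above more concrete, the following sketch (assumptions: an existing Iceberg Table handle named table, arbitrary partition/task ids and operation id) configures an OutputFileFactory the same way writeRecords() does and asks it for a data file; the generated location carries the chosen FileFormat's extension.

import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.io.OutputFileFactory;

class OutputFileFactorySketch {

    static void showOutputFileNaming(Table table) {
        // Same builder chain as in writeRecords(): partition id, attempt number,
        // file format and an operation id that ends up in the file name.
        OutputFileFactory factory = OutputFileFactory.builderFor(table, 1, 0)
            .format(FileFormat.PARQUET)
            .operationId("query-id-job-id")
            .build();

        // Each call produces a fresh output file whose location ends with the
        // ".parquet" extension chosen above.
        EncryptedOutputFile outputFile = factory.newOutputFile();
        System.out.println(outputFile.encryptingOutputFile().location());
    }
}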

Example 10 with FileFormat

use of org.apache.iceberg.FileFormat in project presto by prestodb.

the class IcebergPageSink method createWriter.

private WriteContext createWriter(Optional<PartitionData> partitionData) {
    String fileName = fileFormat.addExtension(randomUUID().toString());
    Path outputPath = partitionData.map(partition -> new Path(locationProvider.newDataLocation(partitionSpec, partition, fileName))).orElse(new Path(locationProvider.newDataLocation(fileName)));
    IcebergFileWriter writer = fileWriterFactory.createFileWriter(outputPath, outputSchema, jobConf, session, hdfsContext, fileFormat);
    return new WriteContext(writer, outputPath, partitionData);
}
Also used : Path(org.apache.hadoop.fs.Path) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) JsonCodec(com.facebook.airlift.json.JsonCodec) Page(com.facebook.presto.common.Page) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) PartitionField(org.apache.iceberg.PartitionField) LocationProvider(org.apache.iceberg.io.LocationProvider) Slices.wrappedBuffer(io.airlift.slice.Slices.wrappedBuffer) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Transform(org.apache.iceberg.transforms.Transform) Map(java.util.Map) ConnectorPageSink(com.facebook.presto.spi.ConnectorPageSink) PageIndexerFactory(com.facebook.presto.spi.PageIndexerFactory) Path(org.apache.hadoop.fs.Path) Decimals.readBigDecimal(com.facebook.presto.common.type.Decimals.readBigDecimal) HdfsContext(com.facebook.presto.hive.HdfsContext) TinyintType(com.facebook.presto.common.type.TinyintType) BigintType(com.facebook.presto.common.type.BigintType) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) VarcharType(com.facebook.presto.common.type.VarcharType) RealType(com.facebook.presto.common.type.RealType) Schema(org.apache.iceberg.Schema) String.format(java.lang.String.format) ConnectorSession(com.facebook.presto.spi.ConnectorSession) SmallintType(com.facebook.presto.common.type.SmallintType) List(java.util.List) PartitionSpec(org.apache.iceberg.PartitionSpec) VarbinaryType(com.facebook.presto.common.type.VarbinaryType) Optional(java.util.Optional) DecimalType(com.facebook.presto.common.type.DecimalType) Slice(io.airlift.slice.Slice) BooleanType(com.facebook.presto.common.type.BooleanType) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) PrestoException(com.facebook.presto.spi.PrestoException) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) Function(java.util.function.Function) ArrayList(java.util.ArrayList) IntegerType(com.facebook.presto.common.type.IntegerType) ImmutableList(com.google.common.collect.ImmutableList) PartitionTransforms.getColumnTransform(com.facebook.presto.iceberg.PartitionTransforms.getColumnTransform) Verify.verify(com.google.common.base.Verify.verify) ConfigurationUtils.toJobConf(com.facebook.presto.hive.util.ConfigurationUtils.toJobConf) Objects.requireNonNull(java.util.Objects.requireNonNull) Math.toIntExact(java.lang.Math.toIntExact) Type(com.facebook.presto.common.type.Type) PageIndexer(com.facebook.presto.spi.PageIndexer) ICEBERG_TOO_MANY_OPEN_PARTITIONS(com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_TOO_MANY_OPEN_PARTITIONS) FileFormat(org.apache.iceberg.FileFormat) JobConf(org.apache.hadoop.mapred.JobConf) ColumnTransform(com.facebook.presto.iceberg.PartitionTransforms.ColumnTransform) UUID.randomUUID(java.util.UUID.randomUUID) Block(com.facebook.presto.common.block.Block) DateType(com.facebook.presto.common.type.DateType) DoubleType(com.facebook.presto.common.type.DoubleType)
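
The first line of createWriter() relies on FileFormat.addExtension() to build the data file name; a quick standalone check of that call is sketched below (not Presto code; the class name is made up).

import java.util.UUID;
import org.apache.iceberg.FileFormat;

class AddExtensionSketch {

    public static void main(String[] args) {
        // addExtension() appends the format's canonical suffix to the random name,
        // e.g. "4f1c...-....parquet" for PARQUET.
        String fileName = FileFormat.PARQUET.addExtension(UUID.randomUUID().toString());
        System.out.println(fileName.endsWith(".parquet"));   // true
    }
}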

Aggregations

FileFormat (org.apache.iceberg.FileFormat): 11
PartitionSpec (org.apache.iceberg.PartitionSpec): 9
Schema (org.apache.iceberg.Schema): 9
Table (org.apache.iceberg.Table): 7
IOException (java.io.IOException): 5
List (java.util.List): 5
Record (org.apache.iceberg.data.Record): 5
ArrayList (java.util.ArrayList): 4
Types (org.apache.iceberg.types.Types): 4
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 4
Assert (org.junit.Assert): 4
Path (org.apache.hadoop.fs.Path): 3
DeleteFile (org.apache.iceberg.DeleteFile): 3
PositionDelete (org.apache.iceberg.deletes.PositionDelete): 3
HdfsContext (com.facebook.presto.hive.HdfsContext): 2
TableAlreadyExistsException (com.facebook.presto.hive.TableAlreadyExistsException): 2
PrestoException (com.facebook.presto.spi.PrestoException): 2
Map (java.util.Map): 2
JobConf (org.apache.hadoop.mapred.JobConf): 2
TaskAttemptID (org.apache.hadoop.mapred.TaskAttemptID): 2