Use of org.apache.iceberg.DataFile in project drill by apache.
From the class OperationTransformer, method toOverwrite:
public Overwrite toOverwrite(String location, Expression expression, List<T> units) {
  WriteData writeData = context.transformer()
      .inputData()
      .units(units)
      .execute();

  File file = context.fileWriter()
      .records(writeData.records())
      .location(location)
      .name(UUID.randomUUID().toString())
      .write();

  DataFile dataFile = DataFiles.builder(context.table().spec())
      .withInputFile(file.input())
      .withMetrics(file.metrics())
      .withPartition(writeData.partition())
      .build();

  return new Overwrite(dataFile, expression);
}
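The Overwrite value pairs the newly written DataFile with the row filter whose data it replaces. As a minimal sketch of how such a pair could be applied to an Iceberg table through the OverwriteFiles API (the commitOverwrite method and its placement are assumptions for illustration, not part of the Drill code above):

import org.apache.iceberg.DataFile;
import org.apache.iceberg.OverwriteFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expression;

public class OverwriteCommitSketch {
  // Applies a single data file as an overwrite of the rows matched by the filter.
  public static void commitOverwrite(Table table, DataFile dataFile, Expression filter) {
    OverwriteFiles overwrite = table.newOverwrite();
    overwrite.overwriteByRowFilter(filter); // rows matching the filter are replaced
    overwrite.addFile(dataFile);            // the replacement file produced by the writer
    overwrite.commit();                     // produces a new table snapshot
  }
}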
Use of org.apache.iceberg.DataFile in project presto by prestodb.
From the class FilesTable, method buildPages:
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable, Optional<Long> snapshotId) {
  PageListBuilder pagesBuilder = forTable(tableMetadata);
  TableScan tableScan = getTableScan(TupleDomain.all(), snapshotId, icebergTable).includeColumnStats();
  Map<Integer, Type> idToTypeMap = getIdToTypeMap(icebergTable.schema());
  tableScan.planFiles().forEach(fileScanTask -> {
    DataFile dataFile = fileScanTask.file();
    pagesBuilder.beginRow();
    pagesBuilder.appendVarchar(dataFile.path().toString());
    pagesBuilder.appendVarchar(dataFile.format().name());
    pagesBuilder.appendBigint(dataFile.recordCount());
    pagesBuilder.appendBigint(dataFile.fileSizeInBytes());
    if (checkNonNull(dataFile.columnSizes(), pagesBuilder)) {
      pagesBuilder.appendIntegerBigintMap(dataFile.columnSizes());
    }
    if (checkNonNull(dataFile.valueCounts(), pagesBuilder)) {
      pagesBuilder.appendIntegerBigintMap(dataFile.valueCounts());
    }
    if (checkNonNull(dataFile.nullValueCounts(), pagesBuilder)) {
      pagesBuilder.appendIntegerBigintMap(dataFile.nullValueCounts());
    }
    if (checkNonNull(dataFile.lowerBounds(), pagesBuilder)) {
      pagesBuilder.appendIntegerVarcharMap(dataFile.lowerBounds().entrySet().stream()
          .collect(toImmutableMap(
              Map.Entry<Integer, ByteBuffer>::getKey,
              entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                  .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
    }
    if (checkNonNull(dataFile.upperBounds(), pagesBuilder)) {
      pagesBuilder.appendIntegerVarcharMap(dataFile.upperBounds().entrySet().stream()
          .collect(toImmutableMap(
              Map.Entry<Integer, ByteBuffer>::getKey,
              entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                  .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
    }
    if (checkNonNull(dataFile.keyMetadata(), pagesBuilder)) {
      pagesBuilder.appendVarbinary(Slices.wrappedBuffer(dataFile.keyMetadata()));
    }
    if (checkNonNull(dataFile.splitOffsets(), pagesBuilder)) {
      pagesBuilder.appendBigintArray(dataFile.splitOffsets());
    }
    pagesBuilder.endRow();
  });
  return pagesBuilder.build();
}
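For comparison, here is a standalone sketch of the same pattern outside Presto: scan a table with column statistics and decode each file's serialized lower bounds back into readable values with Conversions.fromByteBuffer. The class and method names are illustrative and the printing is only for demonstration:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Map;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

public class DataFileStatsSketch {
  public static void printLowerBounds(Table table) throws IOException {
    TableScan scan = table.newScan().includeColumnStats();
    try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
      for (FileScanTask task : tasks) {
        DataFile dataFile = task.file();
        System.out.printf("%s: %d records, %d bytes%n",
            dataFile.path(), dataFile.recordCount(), dataFile.fileSizeInBytes());
        Map<Integer, ByteBuffer> lowerBounds = dataFile.lowerBounds();
        if (lowerBounds != null) {
          for (Map.Entry<Integer, ByteBuffer> entry : lowerBounds.entrySet()) {
            // look up the column type by field id, then decode the serialized bound value
            Types.NestedField field = table.schema().findField(entry.getKey());
            Type type = field.type();
            Object value = Conversions.fromByteBuffer(type, entry.getValue());
            System.out.printf("  field %d (%s) lower bound: %s%n", entry.getKey(), field.name(), value);
          }
        }
      }
    }
  }
}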
Use of org.apache.iceberg.DataFile in project presto by prestodb.
From the class PartitionTable, method getPartitions:
private Map<StructLikeWrapper, Partition> getPartitions(TableScan tableScan) {
  try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
    Map<StructLikeWrapper, Partition> partitions = new HashMap<>();
    for (FileScanTask fileScanTask : fileScanTasks) {
      DataFile dataFile = fileScanTask.file();
      Types.StructType structType = fileScanTask.spec().partitionType();
      StructLike partitionStruct = dataFile.partition();
      StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(structType).set(partitionStruct);
      if (!partitions.containsKey(partitionWrapper)) {
        Partition partition = new Partition(
            idToTypeMapping,
            nonPartitionPrimitiveColumns,
            partitionStruct,
            dataFile.recordCount(),
            dataFile.fileSizeInBytes(),
            toMap(dataFile.lowerBounds()),
            toMap(dataFile.upperBounds()),
            dataFile.nullValueCounts(),
            dataFile.columnSizes());
        partitions.put(partitionWrapper, partition);
        continue;
      }
      Partition partition = partitions.get(partitionWrapper);
      partition.incrementFileCount();
      partition.incrementRecordCount(dataFile.recordCount());
      partition.incrementSize(dataFile.fileSizeInBytes());
      partition.updateMin(toMap(dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
      partition.updateMax(toMap(dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
      partition.updateNullCount(dataFile.nullValueCounts());
    }
    return partitions;
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
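StructLikeWrapper supplies the equals and hashCode that a raw StructLike does not guarantee, which is what makes it safe to use as a map key here. A reduced sketch of the same grouping idea, only counting files per partition (class and method names are illustrative):

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.StructLikeWrapper;

public class PartitionFileCountSketch {
  // Groups the scanned data files by partition value and counts files per partition.
  public static Map<StructLikeWrapper, Long> fileCountsByPartition(TableScan tableScan) {
    Map<StructLikeWrapper, Long> counts = new HashMap<>();
    try (CloseableIterable<FileScanTask> tasks = tableScan.planFiles()) {
      for (FileScanTask task : tasks) {
        DataFile dataFile = task.file();
        Types.StructType partitionType = task.spec().partitionType();
        StructLikeWrapper key = StructLikeWrapper.forType(partitionType).set(dataFile.partition());
        counts.merge(key, 1L, Long::sum);
      }
      return counts;
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}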
Use of org.apache.iceberg.DataFile in project hive by apache.
From the class HiveIcebergOutputCommitter, method commitTask:
/**
* Collects the generated data files and creates a commit file storing the data file list.
* @param originalContext The task attempt context
* @throws IOException Thrown if there is an error writing the commit file
*/
@Override
public void commitTask(TaskAttemptContext originalContext) throws IOException {
  TaskAttemptContext context = TezUtil.enrichContextWithAttemptWrapper(originalContext);
  TaskAttemptID attemptID = context.getTaskAttemptID();
  JobConf jobConf = context.getJobConf();
  Collection<String> outputs = HiveIcebergStorageHandler.outputTables(context.getJobConf());
  Map<String, HiveIcebergRecordWriter> writers = Optional.ofNullable(HiveIcebergRecordWriter.getWriters(attemptID))
      .orElseGet(() -> {
        LOG.info("CommitTask found no writers for output tables: {}, attemptID: {}", outputs, attemptID);
        return ImmutableMap.of();
      });
  ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
  try {
    // Generates commit files for the target tables in parallel
    Tasks.foreach(outputs)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(tableExecutor)
        .run(output -> {
          Table table = HiveIcebergStorageHandler.table(context.getJobConf(), output);
          if (table != null) {
            HiveIcebergRecordWriter writer = writers.get(output);
            DataFile[] closedFiles;
            if (writer != null) {
              closedFiles = writer.dataFiles().toArray(new DataFile[0]);
            } else {
              LOG.info("CommitTask found no writer for specific table: {}, attemptID: {}", output, attemptID);
              closedFiles = new DataFile[0];
            }
            String fileForCommitLocation = generateFileForCommitLocation(table.location(), jobConf,
                attemptID.getJobID(), attemptID.getTaskID().getId());
            // Creating the file containing the data files generated by this task for this table
            createFileForCommit(closedFiles, fileForCommitLocation, table.io());
          } else {
            // When using Tez multi-table inserts, we could have more output tables in config than
            // the actual tables this task has written to and has serialized in its config
            LOG.info("CommitTask found no serialized table in config for table: {}.", output);
          }
        }, IOException.class);
  } finally {
    if (tableExecutor != null) {
      tableExecutor.shutdown();
    }
  }
  // remove the writer to release the object
  HiveIcebergRecordWriter.removeWriters(attemptID);
}
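The createFileForCommit and readFileForCommit helpers are not shown above. A plausible minimal sketch of such a round trip, serializing the DataFile array over Iceberg's FileIO streams, is given below; this is an assumption about their shape, not the actual Hive implementation:

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.io.FileIO;

public class CommitFileSketch {
  // Serializes the task's closed data files to a single commit file at the given location.
  static void createFileForCommit(DataFile[] closedFiles, String location, FileIO io) throws IOException {
    try (ObjectOutputStream out = new ObjectOutputStream(io.newOutputFile(location).createOrOverwrite())) {
      out.writeObject(closedFiles);
    }
  }

  // Reads the data file list written by a task back from its commit file.
  static DataFile[] readFileForCommit(String location, FileIO io) throws IOException {
    try (ObjectInputStream in = new ObjectInputStream(io.newInputFile(location).newStream())) {
      return (DataFile[]) in.readObject();
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }
}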
Use of org.apache.iceberg.DataFile in project hive by apache.
From the class HiveIcebergOutputCommitter, method dataFiles:
/**
* Get the committed data files for this table and job.
*
* @param numTasks Number of writer tasks that produced a forCommit file
* @param executor The executor used for reading the forCommit files in parallel
* @param location The location of the table
* @param jobContext The job context
* @param io The FileIO used for reading the files generated for commit
* @param throwOnFailure If <code>true</code> then it throws an exception on failure
* @return The list of the committed data files
*/
private static Collection<DataFile> dataFiles(int numTasks, ExecutorService executor, String location, JobContext jobContext, FileIO io, boolean throwOnFailure) {
  JobConf conf = jobContext.getJobConf();
  Collection<DataFile> dataFiles = new ConcurrentLinkedQueue<>();
  // Reading the committed files. The assumption here is that the taskIds are generated in sequential order
  // starting from 0.
  Tasks.range(numTasks)
      .throwFailureWhenFinished(throwOnFailure)
      .executeWith(executor)
      .retry(3)
      .run(taskId -> {
        String taskFileName = generateFileForCommitLocation(location, conf, jobContext.getJobID(), taskId);
        dataFiles.addAll(Arrays.asList(readFileForCommit(taskFileName, io)));
      });
  return dataFiles;
}
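Once collected, the data files are typically attached to the table in a single job-level commit. A minimal sketch of such a commit using Iceberg's AppendFiles API (the surrounding job handling is omitted and the method name is illustrative):

import java.util.Collection;

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

public class JobCommitSketch {
  // Appends every collected data file to the table in one atomic snapshot.
  static void commitDataFiles(Table table, Collection<DataFile> dataFiles) {
    if (dataFiles.isEmpty()) {
      return; // nothing written by this job; skip creating an empty snapshot
    }
    AppendFiles append = table.newAppend();
    dataFiles.forEach(append::appendFile);
    append.commit();
  }
}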