Use of org.apache.iceberg.DataFile in project drill by apache.
From the class OperationTransformer, method toOverwrite:
public Overwrite toOverwrite(String location, Expression expression, List<T> units) {
  WriteData writeData = context.transformer()
      .inputData()
      .units(units)
      .execute();

  File file = context.fileWriter()
      .records(writeData.records())
      .location(location)
      .name(UUID.randomUUID().toString())
      .write();

  DataFile dataFile = DataFiles.builder(context.table().spec())
      .withInputFile(file.input())
      .withMetrics(file.metrics())
      .withPartition(writeData.partition())
      .build();

  return new Overwrite(dataFile, expression);
}
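The Overwrite value pairs the newly written DataFile with the row filter whose data it replaces. As a minimal sketch of how such a pair could be applied to an Iceberg table through the OverwriteFiles API (the commitOverwrite method and its placement are assumptions for illustration, not part of the Drill code above):

import org.apache.iceberg.DataFile;
import org.apache.iceberg.OverwriteFiles;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expression;

public class OverwriteCommitSketch {
  // Applies a single data file as an overwrite of the rows matched by the filter.
  public static void commitOverwrite(Table table, DataFile dataFile, Expression filter) {
    OverwriteFiles overwrite = table.newOverwrite();
    overwrite.overwriteByRowFilter(filter); // rows matching the filter are replaced
    overwrite.addFile(dataFile);            // the replacement file produced by the writer
    overwrite.commit();                     // produces a new table snapshot
  }
}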
Use of org.apache.iceberg.DataFile in project presto by prestodb.
From the class FilesTable, method buildPages:
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable, Optional<Long> snapshotId) {
  PageListBuilder pagesBuilder = forTable(tableMetadata);
  TableScan tableScan = getTableScan(TupleDomain.all(), snapshotId, icebergTable).includeColumnStats();
  Map<Integer, Type> idToTypeMap = getIdToTypeMap(icebergTable.schema());
  tableScan.planFiles().forEach(fileScanTask -> {
    DataFile dataFile = fileScanTask.file();
    pagesBuilder.beginRow();
    pagesBuilder.appendVarchar(dataFile.path().toString());
    pagesBuilder.appendVarchar(dataFile.format().name());
    pagesBuilder.appendBigint(dataFile.recordCount());
    pagesBuilder.appendBigint(dataFile.fileSizeInBytes());
    if (checkNonNull(dataFile.columnSizes(), pagesBuilder)) {
      pagesBuilder.appendIntegerBigintMap(dataFile.columnSizes());
    }
    if (checkNonNull(dataFile.valueCounts(), pagesBuilder)) {
      pagesBuilder.appendIntegerBigintMap(dataFile.valueCounts());
    }
    if (checkNonNull(dataFile.nullValueCounts(), pagesBuilder)) {
      pagesBuilder.appendIntegerBigintMap(dataFile.nullValueCounts());
    }
    if (checkNonNull(dataFile.lowerBounds(), pagesBuilder)) {
      pagesBuilder.appendIntegerVarcharMap(dataFile.lowerBounds().entrySet().stream()
          .collect(toImmutableMap(
              Map.Entry<Integer, ByteBuffer>::getKey,
              entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                  .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
    }
    if (checkNonNull(dataFile.upperBounds(), pagesBuilder)) {
      pagesBuilder.appendIntegerVarcharMap(dataFile.upperBounds().entrySet().stream()
          .collect(toImmutableMap(
              Map.Entry<Integer, ByteBuffer>::getKey,
              entry -> Transforms.identity(idToTypeMap.get(entry.getKey()))
                  .toHumanString(Conversions.fromByteBuffer(idToTypeMap.get(entry.getKey()), entry.getValue())))));
    }
    if (checkNonNull(dataFile.keyMetadata(), pagesBuilder)) {
      pagesBuilder.appendVarbinary(Slices.wrappedBuffer(dataFile.keyMetadata()));
    }
    if (checkNonNull(dataFile.splitOffsets(), pagesBuilder)) {
      pagesBuilder.appendBigintArray(dataFile.splitOffsets());
    }
    pagesBuilder.endRow();
  });
  return pagesBuilder.build();
}
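For comparison, here is a standalone sketch of the same pattern outside Presto: scan a table with column statistics and decode each file's serialized lower bounds back into readable values with Conversions.fromByteBuffer. The class and method names are illustrative and the printing is only for demonstration:

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Map;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Conversions;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

public class DataFileStatsSketch {
  public static void printLowerBounds(Table table) throws IOException {
    TableScan scan = table.newScan().includeColumnStats();
    try (CloseableIterable<FileScanTask> tasks = scan.planFiles()) {
      for (FileScanTask task : tasks) {
        DataFile dataFile = task.file();
        System.out.printf("%s: %d records, %d bytes%n",
            dataFile.path(), dataFile.recordCount(), dataFile.fileSizeInBytes());
        Map<Integer, ByteBuffer> lowerBounds = dataFile.lowerBounds();
        if (lowerBounds != null) {
          for (Map.Entry<Integer, ByteBuffer> entry : lowerBounds.entrySet()) {
            // look up the column type by field id, then decode the serialized bound value
            Types.NestedField field = table.schema().findField(entry.getKey());
            Type type = field.type();
            Object value = Conversions.fromByteBuffer(type, entry.getValue());
            System.out.printf("  field %d (%s) lower bound: %s%n", entry.getKey(), field.name(), value);
          }
        }
      }
    }
  }
}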
Use of org.apache.iceberg.DataFile in project presto by prestodb.
From the class PartitionTable, method getPartitions:
private Map<StructLikeWrapper, Partition> getPartitions(TableScan tableScan) {
  try (CloseableIterable<FileScanTask> fileScanTasks = tableScan.planFiles()) {
    Map<StructLikeWrapper, Partition> partitions = new HashMap<>();
    for (FileScanTask fileScanTask : fileScanTasks) {
      DataFile dataFile = fileScanTask.file();
      Types.StructType structType = fileScanTask.spec().partitionType();
      StructLike partitionStruct = dataFile.partition();
      StructLikeWrapper partitionWrapper = StructLikeWrapper.forType(structType).set(partitionStruct);
      if (!partitions.containsKey(partitionWrapper)) {
        Partition partition = new Partition(
            idToTypeMapping,
            nonPartitionPrimitiveColumns,
            partitionStruct,
            dataFile.recordCount(),
            dataFile.fileSizeInBytes(),
            toMap(dataFile.lowerBounds()),
            toMap(dataFile.upperBounds()),
            dataFile.nullValueCounts(),
            dataFile.columnSizes());
        partitions.put(partitionWrapper, partition);
        continue;
      }
      Partition partition = partitions.get(partitionWrapper);
      partition.incrementFileCount();
      partition.incrementRecordCount(dataFile.recordCount());
      partition.incrementSize(dataFile.fileSizeInBytes());
      partition.updateMin(toMap(dataFile.lowerBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
      partition.updateMax(toMap(dataFile.upperBounds()), dataFile.nullValueCounts(), dataFile.recordCount());
      partition.updateNullCount(dataFile.nullValueCounts());
    }
    return partitions;
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
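StructLikeWrapper supplies the equals and hashCode that a raw StructLike does not guarantee, which is what makes it safe to use as a map key here. A reduced sketch of the same grouping idea, only counting files per partition (class and method names are illustrative):

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.StructLikeWrapper;

public class PartitionFileCountSketch {
  // Groups the scanned data files by partition value and counts files per partition.
  public static Map<StructLikeWrapper, Long> fileCountsByPartition(TableScan tableScan) {
    Map<StructLikeWrapper, Long> counts = new HashMap<>();
    try (CloseableIterable<FileScanTask> tasks = tableScan.planFiles()) {
      for (FileScanTask task : tasks) {
        DataFile dataFile = task.file();
        Types.StructType partitionType = task.spec().partitionType();
        StructLikeWrapper key = StructLikeWrapper.forType(partitionType).set(dataFile.partition());
        counts.merge(key, 1L, Long::sum);
      }
      return counts;
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }
}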
Use of org.apache.iceberg.DataFile in project hive by apache.
From the class HiveIcebergOutputCommitter, method commitTask:
/**
* Collects the generated data files and creates a commit file storing the data file list.
* @param originalContext The task attempt context
* @throws IOException Thrown if there is an error writing the commit file
*/
@Override
public void commitTask(TaskAttemptContext originalContext) throws IOException {
  TaskAttemptContext context = TezUtil.enrichContextWithAttemptWrapper(originalContext);
  TaskAttemptID attemptID = context.getTaskAttemptID();
  JobConf jobConf = context.getJobConf();
  Collection<String> outputs = HiveIcebergStorageHandler.outputTables(context.getJobConf());
  Map<String, HiveIcebergRecordWriter> writers = Optional.ofNullable(HiveIcebergRecordWriter.getWriters(attemptID))
      .orElseGet(() -> {
        LOG.info("CommitTask found no writers for output tables: {}, attemptID: {}", outputs, attemptID);
        return ImmutableMap.of();
      });
  ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
  try {
    // Generates commit files for the target tables in parallel
    Tasks.foreach(outputs)
        .retry(3)
        .stopOnFailure()
        .throwFailureWhenFinished()
        .executeWith(tableExecutor)
        .run(output -> {
          Table table = HiveIcebergStorageHandler.table(context.getJobConf(), output);
          if (table != null) {
            HiveIcebergRecordWriter writer = writers.get(output);
            DataFile[] closedFiles;
            if (writer != null) {
              closedFiles = writer.dataFiles().toArray(new DataFile[0]);
            } else {
              LOG.info("CommitTask found no writer for specific table: {}, attemptID: {}", output, attemptID);
              closedFiles = new DataFile[0];
            }
            String fileForCommitLocation = generateFileForCommitLocation(table.location(), jobConf,
                attemptID.getJobID(), attemptID.getTaskID().getId());
            // Creating the file containing the data files generated by this task for this table
            createFileForCommit(closedFiles, fileForCommitLocation, table.io());
          } else {
            // When using Tez multi-table inserts, we could have more output tables in config than
            // the actual tables this task has written to and has serialized in its config
            LOG.info("CommitTask found no serialized table in config for table: {}.", output);
          }
        }, IOException.class);
  } finally {
    if (tableExecutor != null) {
      tableExecutor.shutdown();
    }
  }
  // remove the writer to release the object
  HiveIcebergRecordWriter.removeWriters(attemptID);
}
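The createFileForCommit and readFileForCommit helpers are not shown above. A plausible minimal sketch of such a round trip, serializing the DataFile array over Iceberg's FileIO streams, is given below; this is an assumption about their shape, not the actual Hive implementation:

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.iceberg.DataFile;
import org.apache.iceberg.io.FileIO;

public class CommitFileSketch {
  // Serializes the task's closed data files to a single commit file at the given location.
  static void createFileForCommit(DataFile[] closedFiles, String location, FileIO io) throws IOException {
    try (ObjectOutputStream out = new ObjectOutputStream(io.newOutputFile(location).createOrOverwrite())) {
      out.writeObject(closedFiles);
    }
  }

  // Reads the data file list written by a task back from its commit file.
  static DataFile[] readFileForCommit(String location, FileIO io) throws IOException {
    try (ObjectInputStream in = new ObjectInputStream(io.newInputFile(location).newStream())) {
      return (DataFile[]) in.readObject();
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
  }
}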
Use of org.apache.iceberg.DataFile in project hive by apache.
From the class HiveIcebergOutputCommitter, method dataFiles:
/**
* Get the committed data files for this table and job.
*
* @param numTasks Number of writer tasks that produced a forCommit file
* @param executor The executor used for reading the forCommit files in parallel
* @param location The location of the table
* @param jobContext The job context
* @param io The FileIO used for reading the files generated for commit
* @param throwOnFailure If <code>true</code> then it throws an exception on failure
* @return The list of the committed data files
*/
private static Collection<DataFile> dataFiles(int numTasks, ExecutorService executor, String location, JobContext jobContext, FileIO io, boolean throwOnFailure) {
  JobConf conf = jobContext.getJobConf();
  Collection<DataFile> dataFiles = new ConcurrentLinkedQueue<>();
  // Reading the committed files. The assumption here is that the taskIds are generated in sequential order
  // starting from 0.
  Tasks.range(numTasks)
      .throwFailureWhenFinished(throwOnFailure)
      .executeWith(executor)
      .retry(3)
      .run(taskId -> {
        String taskFileName = generateFileForCommitLocation(location, conf, jobContext.getJobID(), taskId);
        dataFiles.addAll(Arrays.asList(readFileForCommit(taskFileName, io)));
      });
  return dataFiles;
}
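Once collected, the data files are typically attached to the table in a single job-level commit. A minimal sketch of such a commit using Iceberg's AppendFiles API (the surrounding job handling is omitted and the method name is illustrative):

import java.util.Collection;

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.Table;

public class JobCommitSketch {
  // Appends every collected data file to the table in one atomic snapshot.
  static void commitDataFiles(Table table, Collection<DataFile> dataFiles) {
    if (dataFiles.isEmpty()) {
      return; // nothing written by this job; skip creating an empty snapshot
    }
    AppendFiles append = table.newAppend();
    dataFiles.forEach(append::appendFile);
    append.commit();
  }
}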