Use of org.apache.iceberg.DataFile in project hive by apache.
The class GenericAppenderHelper, method writeFile:
public DataFile writeFile(StructLike partition, List<Record> records) throws IOException {
  Preconditions.checkNotNull(table, "table not set");
  // Reserve a unique temporary file name, then delete the file so the appender can create it itself.
  File file = tmp.newFile();
  Assert.assertTrue(file.delete());
  return appendToLocalFile(table, file, fileFormat, partition, records, conf);
}
Use of org.apache.iceberg.DataFile in project hive by apache.
The class GenericAppenderHelper, method appendToTable:
public void appendToTable(DataFile... dataFiles) {
  Preconditions.checkNotNull(table, "table not set");
  // Collect all data files into a single append operation and commit them as one snapshot.
  AppendFiles append = table.newAppend();
  for (DataFile dataFile : dataFiles) {
    append = append.appendFile(dataFile);
  }
  append.commit();
}
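Taken together, the two helpers cover the write-then-append flow used in the tests. A minimal usage sketch, assuming a configured GenericAppenderHelper named helper, a table schema SCHEMA, and columns named id and data; these names, and the use of a null partition for an unpartitioned table, are assumptions for illustration, not part of the original class:

// Hypothetical usage of writeFile and appendToTable (helper, SCHEMA and the column names are assumed).
GenericRecord record = GenericRecord.create(SCHEMA);
record.setField("id", 1L);
record.setField("data", "a");
List<Record> records = Collections.singletonList(record);

// A null partition is assumed to be acceptable for an unpartitioned table;
// the returned DataFile is then committed in a single append.
DataFile dataFile = helper.writeFile(null, records);
helper.appendToTable(dataFile);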
Use of org.apache.iceberg.DataFile in project hive by apache.
The class HiveTableUtil, method importFiles:
/**
 * Import files from given partitions to an Iceberg table.
 * @param sourceLocation location of the HMS table
 * @param format input format class name of the HMS table
 * @param partitionSpecProxy list of HMS table partitions wrapped in PartitionSpecProxy
 * @param partitionKeys list of partition keys
 * @param icebergTableProperties destination Iceberg table properties
 * @param conf a Hadoop configuration
 * @throws MetaException if the files cannot be imported into the Iceberg table
 */
public static void importFiles(String sourceLocation, String format, PartitionSpecProxy partitionSpecProxy,
    List<FieldSchema> partitionKeys, Properties icebergTableProperties, Configuration conf) throws MetaException {
  RemoteIterator<LocatedFileStatus> filesIterator = null;
  // For an unpartitioned table, list the source location up front.
  // This operation must be done before the Iceberg table is created.
  if (partitionSpecProxy.size() == 0) {
    filesIterator = getFilesIterator(new Path(sourceLocation), conf);
  }
  Table icebergTable = Catalogs.createTable(conf, icebergTableProperties);
  AppendFiles append = icebergTable.newAppend();
  PartitionSpec spec = icebergTable.spec();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(icebergTable.properties());
  String nameMappingString = icebergTable.properties().get(TableProperties.DEFAULT_NAME_MAPPING);
  NameMapping nameMapping = nameMappingString != null ? NameMappingParser.fromJson(nameMappingString) : null;
  try {
    if (partitionSpecProxy.size() == 0) {
      // Unpartitioned table: turn every listed file into a DataFile and add it to the append.
      List<DataFile> dataFiles =
          getDataFiles(filesIterator, Collections.emptyMap(), format, spec, metricsConfig, nameMapping, conf);
      dataFiles.forEach(append::appendFile);
    } else {
      // Partitioned table: create one task per HMS partition and run them on a fixed-size thread pool.
      PartitionSpecProxy.PartitionIterator partitionIterator = partitionSpecProxy.getPartitionIterator();
      List<Callable<Void>> tasks = new ArrayList<>();
      while (partitionIterator.hasNext()) {
        Partition partition = partitionIterator.next();
        Callable<Void> task = () -> {
          Path partitionPath = new Path(partition.getSd().getLocation());
          String partitionName = Warehouse.makePartName(partitionKeys, partition.getValues());
          Map<String, String> partitionSpec = Warehouse.makeSpecFromName(partitionName);
          RemoteIterator<LocatedFileStatus> iterator = getFilesIterator(partitionPath, conf);
          List<DataFile> dataFiles =
              getDataFiles(iterator, partitionSpec, format.toLowerCase(), spec, metricsConfig, nameMapping, conf);
          // The shared AppendFiles instance is updated from multiple tasks, so guard the appendFile calls.
          synchronized (append) {
            dataFiles.forEach(append::appendFile);
          }
          return null;
        };
        tasks.add(task);
      }
      int numThreads = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_SERVER2_ICEBERG_METADATA_GENERATOR_THREADS);
      ExecutorService executor = Executors.newFixedThreadPool(numThreads,
          new ThreadFactoryBuilder().setNameFormat("iceberg-metadata-generator-%d").setDaemon(true).build());
      // invokeAll blocks until every partition task has finished.
      executor.invokeAll(tasks);
      executor.shutdown();
    }
    // Commit all collected data files in a single snapshot.
    append.commit();
  } catch (IOException | InterruptedException e) {
    throw new MetaException("Cannot import hive data into iceberg table.\n" + e.getMessage());
  }
}
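The getDataFiles helper is not shown in this excerpt. As a rough illustration of how listed files are typically turned into DataFile objects, here is a minimal sketch for the Parquet case; the method name, the metrics call, and the partition-path handling are assumptions for illustration, not the actual Hive implementation:

// Hypothetical helper that builds one DataFile per listed Parquet file (not the actual Hive code).
// Uses org.apache.iceberg.parquet.ParquetUtil and org.apache.iceberg.hadoop.HadoopInputFile.
private static List<DataFile> parquetDataFiles(RemoteIterator<LocatedFileStatus> files,
    Map<String, String> partitionSpec, PartitionSpec spec, MetricsConfig metricsConfig,
    NameMapping nameMapping, Configuration conf) throws IOException {
  List<DataFile> dataFiles = new ArrayList<>();
  while (files.hasNext()) {
    LocatedFileStatus status = files.next();
    // Read column-level metrics (record count, bounds, null counts) from the Parquet footer.
    Metrics metrics = ParquetUtil.fileMetrics(HadoopInputFile.fromStatus(status, conf), metricsConfig, nameMapping);
    DataFiles.Builder builder = DataFiles.builder(spec)
        .withPath(status.getPath().toString())
        .withFormat(FileFormat.PARQUET)
        .withFileSizeInBytes(status.getLen())
        .withMetrics(metrics);
    if (!partitionSpec.isEmpty()) {
      // e.g. "year=2023/month=07", as produced by Warehouse.makePartName above
      // (values are not escaped here; a real implementation would escape them).
      String partitionPath = partitionSpec.entrySet().stream()
          .map(e -> e.getKey() + "=" + e.getValue())
          .collect(Collectors.joining("/"));
      builder.withPartitionPath(partitionPath);
    }
    dataFiles.add(builder.build());
  }
  return dataFiles;
}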
Use of org.apache.iceberg.DataFile in project hive by apache.
The class HiveIcebergOutputCommitter, method abortJob:
/**
* Removes the generated data files if there is a commit file already generated for them.
* The cleanup at the end removes the temporary directories as well.
* @param originalContext The job context
* @param status The status of the job
* @throws IOException if there is a failure deleting the files
*/
@Override
public void abortJob(JobContext originalContext, int status) throws IOException {
  JobContext jobContext = TezUtil.enrichContextWithVertexId(originalContext);
  JobConf jobConf = jobContext.getJobConf();
  LOG.info("Job {} is aborted. Data file cleaning started", jobContext.getJobID());
  Collection<String> outputs = HiveIcebergStorageHandler.outputTables(jobContext.getJobConf());
  Collection<String> jobLocations = new ConcurrentLinkedQueue<>();
  ExecutorService fileExecutor = fileExecutor(jobConf);
  ExecutorService tableExecutor = tableExecutor(jobConf, outputs.size());
  try {
    // Clean up the changes for the output tables in parallel.
    Tasks.foreach(outputs)
        .suppressFailureWhenFinished()
        .executeWith(tableExecutor)
        .onFailure((output, exc) -> LOG.warn("Failed cleanup table {} on abort job", output, exc))
        .run(output -> {
          LOG.info("Cleaning job for jobID: {}, table: {}", jobContext.getJobID(), output);
          Table table = HiveIcebergStorageHandler.table(jobConf, output);
          String jobLocation = generateJobLocation(table.location(), jobConf, jobContext.getJobID());
          jobLocations.add(jobLocation);
          // List jobLocation to get the number of forCommit files. We do this because the
          // map/reduce task count in jobConf is unreliable and we have no access to vertex status info.
          int numTasks = listForCommits(jobConf, jobLocation).size();
          Collection<DataFile> dataFiles =
              dataFiles(numTasks, fileExecutor, table.location(), jobContext, table.io(), false);
          // Check if we have files already committed and remove the data files if there are any.
          if (dataFiles.size() > 0) {
            Tasks.foreach(dataFiles)
                .retry(3)
                .suppressFailureWhenFinished()
                .executeWith(fileExecutor)
                .onFailure((file, exc) -> LOG.warn("Failed to remove data file {} on abort job", file.path(), exc))
                .run(file -> table.io().deleteFile(file.path().toString()));
          }
        }, IOException.class);
  } finally {
    fileExecutor.shutdown();
    if (tableExecutor != null) {
      tableExecutor.shutdown();
    }
  }
  LOG.info("Job {} is aborted. Data file cleaning finished", jobContext.getJobID());
  cleanup(jobContext, jobLocations);
}
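The nested cleanup relies on Iceberg's Tasks utility for retries and failure suppression. The same idiom in isolation, as a minimal sketch; the filesToDelete list, the io variable, and the logger are assumptions for illustration:

// Minimal sketch of the Tasks retry/suppress idiom used above (names are assumed, not from the committer).
FileIO io = table.io();
List<String> filesToDelete = Lists.newArrayList("/path/to/data-1.parquet");
Tasks.foreach(filesToDelete)
    .retry(3)                        // retry each delete up to 3 times
    .suppressFailureWhenFinished()   // do not fail the whole cleanup if a single delete keeps failing
    .onFailure((path, exc) -> LOG.warn("Failed to delete {}", path, exc))
    .run(io::deleteFile);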
Use of org.apache.iceberg.DataFile in project hive by apache.
The class HiveCreateReplaceTableTest, method testCreateTableTxnWithGlobalTableLocation:
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));
  // Create the table in a transaction with an explicit, fully qualified table location.
  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();
  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();
  table.newAppend().appendFile(dataFile).commit();
  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
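If the spec carried a partition, the same builder could also take a partition path; a hedged variant, where the column name data_bucket and its value are assumptions for illustration only:

// Hypothetical partitioned variant of the builder above (partition column name and value are assumed).
DataFile partitionedFile = DataFiles.builder(SPEC)
    .withPath("/path/to/data-b.parquet")
    .withPartitionPath("data_bucket=0")   // must match a column of the table's partition spec
    .withFileSizeInBytes(0)
    .withRecordCount(1)
    .build();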