Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class FileSet, method recomputeStats.
private void recomputeStats() {
  this.totalEntities = this.generatedEntities.size();
  this.totalSize = 0;
  // Only CopyableFile entities contribute bytes; other CopyEntity types (e.g. commit steps) have no origin file.
  for (CopyEntity copyEntity : this.generatedEntities) {
    if (copyEntity instanceof CopyableFile) {
      this.totalSize += ((CopyableFile) copyEntity).getOrigin().getLen();
    }
  }
}
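The same instanceof filter works on any collection of copy entities. A minimal standalone sketch under the API shown above (CopyEntity, CopyableFile.getOrigin().getLen()); the class and method names are hypothetical:

import java.util.Collection;

import org.apache.gobblin.data.management.copy.CopyEntity;
import org.apache.gobblin.data.management.copy.CopyableFile;

public class CopyStats {
  // Hypothetical helper: total origin size, in bytes, of the CopyableFile entities in a collection.
  public static long totalOriginBytes(Collection<CopyEntity> entities) {
    long total = 0;
    for (CopyEntity entity : entities) {
      // Non-file entities (e.g. pre/post publish steps) carry no origin file and are skipped.
      if (entity instanceof CopyableFile) {
        total += ((CopyableFile) entity).getOrigin().getLen();
      }
    }
    return total;
  }
}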
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class UnpartitionedTableFileSet, method generateCopyEntities.
// Suppress warnings for "stepPriority++" in the PrePublishStep constructor, as stepPriority may be used later
@SuppressFBWarnings("DLS_DEAD_LOCAL_STORE")
@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
  MultiTimingEvent multiTimer = new MultiTimingEvent(this.helper.getEventSubmitter(), "TableCopy", true);
  int stepPriority = 0;
  String fileSet = getTable().getTableName();
  List<CopyEntity> copyEntities = Lists.newArrayList();
  Optional<Table> existingTargetTable = this.helper.getExistingTargetTable();
  if (existingTargetTable.isPresent()) {
    if (!this.helper.getTargetTable().getDataLocation().equals(existingTargetTable.get().getDataLocation())) {
      switch (this.helper.getExistingEntityPolicy()) {
        case UPDATE_TABLE:
          // Update the location of the files while keeping the existing table entity.
          log.warn("Source table will not be deregistered although the file location has changed; updating the table's"
              + " file location to " + this.helper.getTargetTable().getDataLocation());
          existingTargetTable = Optional.absent();
          break;
        case REPLACE_TABLE:
        case REPLACE_TABLE_AND_PARTITIONS:
          // Required to de-register the original table.
          log.warn("Source and target table are not compatible. Will override target table "
              + existingTargetTable.get().getDataLocation());
          stepPriority = this.helper.addTableDeregisterSteps(copyEntities, fileSet, stepPriority, this.helper.getTargetTable());
          existingTargetTable = Optional.absent();
          break;
        default:
          log.error("Source and target table are not compatible. Aborting copy of table " + this.helper.getTargetTable(),
              new HiveTableLocationNotMatchException(this.helper.getTargetTable().getDataLocation(),
                  existingTargetTable.get().getDataLocation()));
          multiTimer.close();
          return Lists.newArrayList();
      }
    }
  }
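  // Whichever branch ran, existingTargetTable is now absent unless the existing table's data location
  // already matches the desired target location, i.e. unless the tables are compatible.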
  stepPriority = this.helper.addSharedSteps(copyEntities, fileSet, stepPriority);
  HiveLocationDescriptor sourceLocation =
      HiveLocationDescriptor.forTable(getTable(), getHiveDataset().getFs(), getHiveDataset().getProperties());
  HiveLocationDescriptor desiredTargetLocation =
      HiveLocationDescriptor.forTable(this.helper.getTargetTable(), this.helper.getTargetFs(), getHiveDataset().getProperties());
  Optional<HiveLocationDescriptor> existingTargetLocation = existingTargetTable.isPresent()
      ? Optional.of(HiveLocationDescriptor.forTable(existingTargetTable.get(), this.helper.getTargetFs(),
          getHiveDataset().getProperties()))
      : Optional.<HiveLocationDescriptor>absent();
  if (this.helper.getFastTableSkip().isPresent() && this.helper.getFastTableSkip().get().apply(this.helper)) {
    log.info(String.format("Skipping copy of table %s due to fast table skip predicate.",
        getTable().getDbName() + "." + getTable().getTableName()));
    multiTimer.close();
    return Lists.newArrayList();
  }
  HiveCopyEntityHelper.DiffPathSet diffPathSet = HiveCopyEntityHelper.fullPathDiff(sourceLocation, desiredTargetLocation,
      existingTargetLocation, Optional.<Partition>absent(), multiTimer, this.helper);
  multiTimer.nextStage(HiveCopyEntityHelper.Stages.FULL_PATH_DIFF);
  // Could be used to delete files from the existing snapshot.
  DeleteFileCommitStep deleteStep =
      DeleteFileCommitStep.fromPaths(this.helper.getTargetFs(), diffPathSet.pathsToDelete, getHiveDataset().getProperties());
  copyEntities.add(new PrePublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, stepPriority++));
  for (CopyableFile.Builder builder : this.helper.getCopyableFilesFromPaths(diffPathSet.filesToCopy,
      this.helper.getConfiguration(), Optional.<Partition>absent())) {
    CopyableFile fileEntity =
        builder.fileSet(fileSet).datasetOutputPath(desiredTargetLocation.location.toString()).build();
    this.helper.setCopyableFileDatasets(fileEntity);
    copyEntities.add(fileEntity);
  }
  multiTimer.close();
  return copyEntities;
}
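The fast table skip above is plain predicate application: getFastTableSkip() returns an Optional predicate over the helper, and a true result short-circuits the whole table copy. A minimal sketch of such a predicate, assuming it is Guava's Predicate (consistent with the Optional/apply usage in this code); the class name and its never-skip behavior are hypothetical:

import com.google.common.base.Predicate;

// Hypothetical skip predicate: never skip, so every table goes through the full path diff.
public class NeverSkipTablePredicate implements Predicate<HiveCopyEntityHelper> {
  @Override
  public boolean apply(HiveCopyEntityHelper helper) {
    // Returning true here would make generateCopyEntities() return an empty list for this table.
    return false;
  }
}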
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class HivePartitionFileSet, method generateCopyEntities.
@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
  try (Closer closer = Closer.create()) {
    MultiTimingEvent multiTimer = closer.register(new MultiTimingEvent(this.eventSubmitter, "PartitionCopy", true));
    int stepPriority = 0;
    String fileSet = HiveCopyEntityHelper.gson.toJson(this.partition.getValues());
    List<CopyEntity> copyEntities = Lists.newArrayList();
    stepPriority = hiveCopyEntityHelper.addSharedSteps(copyEntities, fileSet, stepPriority);
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.COMPUTE_TARGETS);
    Path targetPath = hiveCopyEntityHelper.getTargetLocation(hiveCopyEntityHelper.getDataset().fs,
        hiveCopyEntityHelper.getTargetFs(), this.partition.getDataLocation(), Optional.of(this.partition));
    Partition targetPartition = getTargetPartition(this.partition, targetPath);
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.EXISTING_PARTITION);
    if (this.existingTargetPartition.isPresent()) {
      hiveCopyEntityHelper.getTargetPartitions().remove(this.partition.getValues());
      try {
        checkPartitionCompatibility(targetPartition, this.existingTargetPartition.get());
      } catch (IOException ioe) {
        if (hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_PARTITIONS
            && hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_TABLE_AND_PARTITIONS) {
          log.error("Source and target partitions are not compatible. Aborting copy of partition " + this.partition, ioe);
          return Lists.newArrayList();
        }
        log.warn("Source and target partitions are not compatible. Will override target partition: " + ioe.getMessage());
        log.debug("Incompatibility details: ", ioe);
        stepPriority = hiveCopyEntityHelper.addPartitionDeregisterSteps(copyEntities, fileSet, stepPriority,
            hiveCopyEntityHelper.getTargetTable(), this.existingTargetPartition.get());
        this.existingTargetPartition = Optional.absent();
      }
    }
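    // From here on, existingTargetPartition is absent unless the existing partition proved compatible
    // with the desired target partition.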
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.PARTITION_SKIP_PREDICATE);
    if (hiveCopyEntityHelper.getFastPartitionSkip().isPresent()
        && hiveCopyEntityHelper.getFastPartitionSkip().get().apply(this)) {
      log.info(String.format("Skipping copy of partition %s due to fast partition skip predicate.",
          this.partition.getCompleteName()));
      return Lists.newArrayList();
    }
    HiveSpec partitionHiveSpec = new SimpleHiveSpec.Builder<>(targetPath)
        .withTable(HiveMetaStoreUtils.getHiveTable(hiveCopyEntityHelper.getTargetTable().getTTable()))
        .withPartition(Optional.of(HiveMetaStoreUtils.getHivePartition(targetPartition.getTPartition()))).build();
    HiveRegisterStep register = new HiveRegisterStep(hiveCopyEntityHelper.getTargetURI(), partitionHiveSpec,
        hiveCopyEntityHelper.getHiveRegProps());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), register, stepPriority++));
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_LOCATIONS);
    HiveLocationDescriptor sourceLocation =
        HiveLocationDescriptor.forPartition(this.partition, hiveCopyEntityHelper.getDataset().fs, this.properties);
    HiveLocationDescriptor desiredTargetLocation =
        HiveLocationDescriptor.forPartition(targetPartition, hiveCopyEntityHelper.getTargetFs(), this.properties);
    Optional<HiveLocationDescriptor> existingTargetLocation = this.existingTargetPartition.isPresent()
        ? Optional.of(HiveLocationDescriptor.forPartition(this.existingTargetPartition.get(),
            hiveCopyEntityHelper.getTargetFs(), this.properties))
        : Optional.<HiveLocationDescriptor>absent();
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.FULL_PATH_DIFF);
    HiveCopyEntityHelper.DiffPathSet diffPathSet = HiveCopyEntityHelper.fullPathDiff(sourceLocation,
        desiredTargetLocation, existingTargetLocation, Optional.<Partition>absent(), multiTimer, hiveCopyEntityHelper);
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_DELETE_UNITS);
    if (diffPathSet.pathsToDelete.size() > 0) {
      DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(hiveCopyEntityHelper.getTargetFs(),
          diffPathSet.pathsToDelete, hiveCopyEntityHelper.getDataset().properties);
      copyEntities.add(new PrePublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, stepPriority++));
    }
    multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_COPY_UNITS);
    for (CopyableFile.Builder builder : hiveCopyEntityHelper.getCopyableFilesFromPaths(diffPathSet.filesToCopy,
        hiveCopyEntityHelper.getConfiguration(), Optional.of(this.partition))) {
      CopyableFile fileEntity = builder.fileSet(fileSet).checksum(new byte[0])
          .datasetOutputPath(desiredTargetLocation.location.toString()).build();
      this.hiveCopyEntityHelper.setCopyableFileDatasets(fileEntity);
      copyEntities.add(fileEntity);
    }
    log.info("Created {} copy entities for partition {}", copyEntities.size(), this.partition.getCompleteName());
    return copyEntities;
  }
}
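Both generateCopyEntities implementations end the same way: each CopyableFile.Builder produced by getCopyableFilesFromPaths is finalized with a file set name and a dataset output path before being added to the result. A fragment isolating that step, with builders, outputRoot, and the file set name as hypothetical placeholders:

List<CopyEntity> copyEntities = Lists.newArrayList();
for (CopyableFile.Builder builder : builders) {
  CopyableFile fileEntity = builder
      .fileSet("db.table")                      // groups entities that belong to the same publish unit
      .checksum(new byte[0])                    // empty checksum, as in the partition case above
      .datasetOutputPath(outputRoot.toString()) // final location under the target dataset
      .build();
  copyEntities.add(fileEntity);
}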
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class FileAwareInputStreamDataWriter, method commit.
/**
* Moves the file from task staging to task output. Each task has its own staging directory but all the tasks share
* the same task output directory.
*
* {@inheritDoc}
*
* @see org.apache.gobblin.writer.DataWriter#commit()
*/
@Override
public void commit() throws IOException {
  if (!this.actualProcessedCopyableFile.isPresent()) {
    return;
  }
  CopyableFile copyableFile = this.actualProcessedCopyableFile.get();
  Path stagingFilePath = getStagingFilePath(copyableFile);
  Path outputFilePath = getOutputFilePath(copyableFile, this.outputDir,
      copyableFile.getDatasetAndPartition(this.copyableDatasetMetadata));
  log.info(String.format("Committing data from %s to %s", stagingFilePath, outputFilePath));
  try {
    setFilePermissions(copyableFile);
    Iterator<OwnerAndPermission> ancestorOwnerAndPermissionIt = copyableFile.getAncestorsOwnerAndPermission() == null
        ? Iterators.<OwnerAndPermission>emptyIterator()
        : copyableFile.getAncestorsOwnerAndPermission().iterator();
    ensureDirectoryExists(this.fs, outputFilePath.getParent(), ancestorOwnerAndPermissionIt);
    if (!this.fs.rename(stagingFilePath, outputFilePath)) {
      // rename returns false rather than throwing when, e.g., the target already exists
      throw new IOException(String.format("Could not commit file %s.", outputFilePath));
    }
  } catch (IOException ioe) {
    // persist the staged file so a later execution can recover it instead of re-copying
    this.recoveryHelper.persistFile(this.state, copyableFile, stagingFilePath);
    throw ioe;
  } finally {
    try {
      this.fs.delete(this.stagingDir, true);
    } catch (IOException ioe) {
      log.warn("Failed to delete staging path at " + this.stagingDir);
    }
  }
}
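The commit itself is a single FileSystem.rename whose boolean result must be checked. A minimal standalone sketch of the same move-or-fail pattern against the Hadoop FileSystem API; the class and method names are hypothetical, and it deliberately omits the ancestor owner/permission handling done by ensureDirectoryExists above:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CommitSketch {
  // Hypothetical helper mirroring the commit step: move a staged file into place or fail loudly.
  static void moveOrFail(FileSystem fs, Path staging, Path output) throws IOException {
    if (!fs.exists(output.getParent())) {
      fs.mkdirs(output.getParent()); // unlike the real code, no owners/permissions are applied here
    }
    // FileSystem.rename returns false instead of throwing for several failure modes,
    // including an already-existing target, so the result must be checked explicitly.
    if (!fs.rename(staging, output)) {
      throw new IOException(String.format("Could not commit file %s.", output));
    }
  }
}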
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class DeletingCopyDataPublisher, method deleteFilesOnSource.
private void deleteFilesOnSource(WorkUnitState state) throws IOException {
  CopyEntity copyEntity = CopySource.deserializeCopyEntity(state);
  if (copyEntity instanceof CopyableFile) {
    // Remove both the copied file and the READY_EXTENSION marker file that ReadyCopyableFileFilter expects next to it.
    HadoopUtils.deletePath(this.sourceFs, ((CopyableFile) copyEntity).getOrigin().getPath(), true);
    HadoopUtils.deletePath(this.sourceFs,
        PathUtils.addExtension(((CopyableFile) copyEntity).getOrigin().getPath(), ReadyCopyableFileFilter.READY_EXTENSION), true);
  }
}
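ReadyCopyableFileFilter implies a marker-file handshake: a producer drops a companion marker next to each file that is ready to copy, and deleteFilesOnSource cleans up both. A sketch of the producer side, reusing PathUtils and ReadyCopyableFileFilter as in the snippet above and assuming an empty marker file is sufficient; markReady and its class are hypothetical:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadyMarkerSketch {
  // Hypothetical producer-side helper: create the marker file that ReadyCopyableFileFilter looks for.
  static void markReady(FileSystem fs, Path file) throws IOException {
    Path marker = PathUtils.addExtension(file, ReadyCopyableFileFilter.READY_EXTENSION);
    // Assumption: only the marker's existence matters, so an empty file is enough.
    fs.create(marker).close();
  }
}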