Use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.
Example from class DatasetFinderSource, method createWorkUnitStream.
/**
 * Builds a lazy stream of {@link WorkUnit}s for every dataset located by the configured finder.
 *
 * <p>When {@code drilldownIntoPartitions} is set, each {@link PartitionableDataset} is expanded
 * into its partitions and one work unit is emitted per partition; non-partitionable datasets are
 * wrapped in a {@link DatasetWrapper} so they flow through the same partition pipeline. Otherwise
 * one work unit is emitted per dataset.
 *
 * @param state the source state used to construct the dataset finder
 * @return a stream of work units, one per dataset or per partition
 * @throws IOException if the dataset finder cannot be created or the dataset stream cannot be opened
 */
private Stream<WorkUnit> createWorkUnitStream(SourceState state) throws IOException {
  IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
  Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(0, null);
  if (this.drilldownIntoPartitions) {
    return datasetStream.flatMap(dataset -> {
      if (dataset instanceof PartitionableDataset) {
        try {
          return (Stream<PartitionableDataset.DatasetPartition>) ((PartitionableDataset) dataset).getPartitions(0, null);
        } catch (IOException ioe) {
          // Pass the exception to the logger so the stack trace is preserved instead of swallowed;
          // the dataset is skipped rather than failing the whole stream.
          log.error("Failed to get partitions for dataset " + dataset.getUrn(), ioe);
          return Stream.empty();
        }
      } else {
        // Wrap plain datasets so the downstream partition mapping applies uniformly.
        return Stream.of(new DatasetWrapper(dataset));
      }
    }).map(this::workUnitForPartitionInternal);
  } else {
    return datasetStream.map(this::workUnitForDataset);
  }
}
Use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.
Example from class ConfigBasedCleanabledDatasetFinder, method findDatasetsCallable.
/**
 * Creates a {@link Callable} that resolves the {@link Config} at the given URI into a single
 * {@link ConfigurableCleanableDataset} and adds it to the supplied collection.
 *
 * @param confClient client used to fetch the config for {@code u}
 * @param u URI identifying the config to resolve
 * @param p properties passed through to the constructed dataset
 * @param blacklistURNs unused by this implementation
 * @param datasets collection the discovered dataset is added to
 * @return a callable performing the lookup-and-add; returns {@code null} on completion
 */
protected Callable<Void> findDatasetsCallable(final ConfigClient confClient, final URI u, final Properties p, Optional<List<String>> blacklistURNs, final Collection<Dataset> datasets) {
  return () -> {
    // Fetch the {@link Config} for this URI, build the dataset it describes, and record it.
    Config datasetConfig = confClient.getConfig(u);
    Dataset discovered =
        new ConfigurableCleanableDataset(fileSystem, p, new Path(datasetConfig.getString(DATASET_PATH)), datasetConfig, log);
    datasets.add(discovered);
    return null;
  };
}
Use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.
Example from class ComplianceRetentionJob, method run.
/**
 * Runs the retention job: first drops the Hive tables queued in {@code tablesToDrop}, then
 * submits one asynchronous clean task per dataset found by the configured finder.
 *
 * <p>{@code finishCleanSignal} is a latch sized to the dataset count; every callback — success
 * or failure — counts it down exactly once, so it always reaches zero and a waiter can use it
 * to detect completion of all clean tasks.
 *
 * @throws IOException if the dataset finder fails while locating datasets
 */
public void run() throws IOException {
// Dropping empty tables
for (HiveDataset dataset : this.tablesToDrop) {
log.info("Dropping table: " + dataset.getTable().getCompleteName());
executeDropTableQuery(dataset, this.properties);
}
// The finder must have been configured before run() is invoked.
Preconditions.checkNotNull(this.finder, "Dataset finder class is not set");
List<Dataset> datasets = this.finder.findDatasets();
this.finishCleanSignal = Optional.of(new CountDownLatch(datasets.size()));
for (final Dataset dataset : datasets) {
ListenableFuture<Void> future = this.service.submit(new Callable<Void>() {
@Override
public Void call() throws Exception {
// Only CleanableDataset instances know how to clean themselves; anything else is
// skipped with a warning rather than treated as an error.
if (dataset instanceof CleanableDataset) {
((CleanableDataset) dataset).clean();
} else {
log.warn("Not an instance of " + CleanableDataset.class + " Dataset won't be cleaned " + dataset.datasetURN());
}
return null;
}
});
Futures.addCallback(future, new FutureCallback<Void>() {
@Override
public void onSuccess(@Nullable Void result) {
ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
log.info("Successfully cleaned: " + dataset.datasetURN());
}
@Override
public void onFailure(Throwable t) {
// Count down even on failure so the latch always reaches zero; the throwable is
// collected and a failure event is emitted for reporting.
ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
log.warn("Exception caught when cleaning " + dataset.datasetURN() + ".", t);
ComplianceRetentionJob.this.throwables.add(t);
ComplianceRetentionJob.this.eventSubmitter.submit(ComplianceEvents.Retention.FAILED_EVENT_NAME, ImmutableMap.of(ComplianceEvents.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(t), ComplianceEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
}
});
}
}
Use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.
Example from class CompactionSource, method getWorkunitStream.
/**
 * Initializes the compaction suite and job directories, discovers the datasets to compact, and
 * returns a {@link WorkUnitStream} that is populated asynchronously by a single background
 * generator thread.
 *
 * @param state the source state carrying job configuration
 * @return a stream of compaction work units backed by {@code workUnitIterator}
 * @throws RuntimeException wrapping any {@link IOException} raised during initialization
 */
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
  try {
    fs = getSourceFileSystem(state);
    state.setProp(COMPACTION_INIT_TIME, DateTimeUtils.currentTimeMillis());
    suite = CompactionSuiteUtils.getCompactionSuiteFactory(state).createSuite(state);
    initRequestAllocator(state);
    initJobDir(state);
    copyJarDependencies(state);
    // Reuse the file system handle obtained above instead of constructing a second one from the
    // same state.
    DatasetsFinder finder = DatasetUtils.instantiateDatasetFinder(state.getProperties(), fs, DefaultFileSystemGlobFinder.class.getName());
    List<Dataset> datasets = finder.findDatasets();
    CompactionWorkUnitIterator workUnitIterator = new CompactionWorkUnitIterator();
    // Spawn a single thread to create work units
    new Thread(new SingleWorkUnitGeneratorService(state, prioritize(datasets, state), workUnitIterator), "SingleWorkUnitGeneratorService").start();
    return new BasicWorkUnitStream.Builder(workUnitIterator).build();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.
Example from class ConfigBasedCopyableDatasetFinder, method findDatasetsCallable.
/**
 * Creates a {@link Callable} that resolves the {@link Config} at the given URI into the copyable
 * datasets it describes and adds all of them to the supplied collection.
 *
 * @param confClient client used to fetch the config for {@code u}
 * @param u URI identifying the config to resolve
 * @param p properties passed through to dataset construction
 * @param blacklistPatterns patterns forwarded to {@link ConfigBasedMultiDatasets} for filtering
 * @param datasets collection the discovered datasets are added to
 * @return a callable performing the lookup-and-add; returns {@code null} on completion
 */
protected Callable<Void> findDatasetsCallable(final ConfigClient confClient, final URI u, final Properties p, Optional<List<String>> blacklistPatterns, final Collection<Dataset> datasets) {
  return () -> {
    // Fetch the {@link Config} for this URI, expand it into its dataset list, and record them all.
    Config datasetConfig = confClient.getConfig(u);
    List<Dataset> discovered =
        new ConfigBasedMultiDatasets(datasetConfig, p, blacklistPatterns).getConfigBasedDatasetList();
    datasets.addAll(discovered);
    return null;
  };
}
Aggregations