Search in sources :

Example 11 with Dataset

use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.

In the class CompactionSource, the method prioritize:

/**
 * Selects at most the configured maximum number of datasets for this run,
 * delegating the choice to the configured request allocator.
 *
 * @param datasets candidate datasets discovered for compaction
 * @param state    job state carrying the max-dataset-count property
 * @return the allocator-selected subset of {@code datasets}
 */
private List<Dataset> prioritize(List<Dataset> datasets, State state) {
    // Upper bound on datasets per run. NOTE(review): "DEFUALT" typo is in the
    // upstream constant name itself, so it must be referenced as-is.
    double datasetLimit = state.getPropAsDouble(MRCompactor.COMPACTION_DATASETS_MAX_COUNT, MRCompactor.DEFUALT_COMPACTION_DATASETS_MAX_COUNT);
    ResourcePool resourcePool = ResourcePool.builder().maxResource(SimpleDatasetRequest.SIMPLE_DATASET_COUNT_DIMENSION, datasetLimit).build();
    // Wrap each dataset in a requestor, let the allocator pick the winners,
    // then unwrap the granted requests back into datasets.
    Iterator<Dataset> selected = Iterators.transform(
        this.allocator.allocateRequests(datasets.stream().map(SimpleDatasetRequestor::new).iterator(), resourcePool),
        request -> request.getDataset());
    return Lists.newArrayList(selected);
}
Also used : Dataset(org.apache.gobblin.dataset.Dataset) ResourcePool(org.apache.gobblin.util.request_allocation.ResourcePool)

Example 12 with Dataset

use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.

In the class DatasetCleaner, the method clean:

/**
 * Perform the cleanup of old / deprecated dataset versions.
 * @throws IOException
 */
/**
 * Perform the cleanup of old / deprecated dataset versions.
 *
 * <p>Submits one asynchronous clean task per discovered dataset. Each task's
 * completion — success or failure — counts down {@code finishCleanSignal};
 * failures are additionally recorded in {@code throwables} and reported via
 * metrics and a retention event.
 *
 * @throws IOException if dataset discovery fails
 */
public void clean() throws IOException {
    List<Dataset> foundDatasets = this.datasetFinder.findDatasets();
    this.finishCleanSignal = Optional.of(new CountDownLatch(foundDatasets.size()));
    for (final Dataset dataset : foundDatasets) {
        ListenableFuture<Void> cleanFuture = this.service.submit(() -> {
            // Only datasets implementing CleanableDataset know how to clean themselves;
            // others are silently skipped (the latch is still counted down in onSuccess).
            if (dataset instanceof CleanableDataset) {
                ((CleanableDataset) dataset).clean();
            }
            return null;
        });
        Futures.addCallback(cleanFuture, new FutureCallback<Void>() {

            @Override
            public void onSuccess(Void result) {
                DatasetCleaner.this.finishCleanSignal.get().countDown();
                LOG.info("Successfully cleaned: " + dataset.datasetURN());
                Instrumented.markMeter(DatasetCleaner.this.datasetsCleanSuccessMeter);
            }

            @Override
            public void onFailure(Throwable throwable) {
                DatasetCleaner.this.finishCleanSignal.get().countDown();
                LOG.warn("Exception caught when cleaning " + dataset.datasetURN() + ".", throwable);
                DatasetCleaner.this.throwables.add(throwable);
                Instrumented.markMeter(DatasetCleaner.this.datasetsCleanFailureMeter);
                DatasetCleaner.this.eventSubmitter.submit(RetentionEvents.CleanFailed.EVENT_NAME, ImmutableMap.of(RetentionEvents.CleanFailed.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(throwable), RetentionEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
            }
        });
    }
}
Also used : CleanableDataset(org.apache.gobblin.data.management.retention.dataset.CleanableDataset) Dataset(org.apache.gobblin.dataset.Dataset) CleanableDataset(org.apache.gobblin.data.management.retention.dataset.CleanableDataset) CountDownLatch(java.util.concurrent.CountDownLatch) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)

Example 13 with Dataset

use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.

In the class LoopingDatasetFinderSource, the method getWorkunitStream:

@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
    try {
        int maxWorkUnits = state.getPropAsInt(MAX_WORK_UNITS_PER_RUN_KEY, MAX_WORK_UNITS_PER_RUN);
        List<WorkUnitState> previousWorkUnitStates = state.getPreviousWorkUnitStates();
        Optional<WorkUnitState> maxWorkUnit;
        try {
            maxWorkUnit = previousWorkUnitStates.stream().reduce((wu1, wu2) -> {
                int wu1Ordinal = wu1.getPropAsInt(WORK_UNIT_ORDINAL);
                int wu2Ordinal = wu2.getPropAsInt(WORK_UNIT_ORDINAL);
                return wu1Ordinal > wu2Ordinal ? wu1 : wu2;
            });
        } catch (NumberFormatException nfe) {
            throw new RuntimeException("Work units in state store are corrupted! Missing or malformed " + WORK_UNIT_ORDINAL);
        }
        String previousDatasetUrnWatermark = null;
        String previousPartitionUrnWatermark = null;
        if (maxWorkUnit.isPresent() && !maxWorkUnit.get().getPropAsBoolean(END_OF_DATASETS_KEY, false)) {
            previousDatasetUrnWatermark = maxWorkUnit.get().getProp(DATASET_URN);
            previousPartitionUrnWatermark = maxWorkUnit.get().getProp(PARTITION_URN);
        }
        IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
        Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(Spliterator.SORTED, this.lexicographicalComparator);
        datasetStream = sortStreamLexicographically(datasetStream);
        return new BasicWorkUnitStream.Builder(new DeepIterator(datasetStream.iterator(), previousDatasetUrnWatermark, previousPartitionUrnWatermark, maxWorkUnits)).setFiniteStream(true).build();
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
Also used : WorkUnitStream(org.apache.gobblin.source.workunit.WorkUnitStream) Iterator(java.util.Iterator) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) AbstractIterator(com.google.common.collect.AbstractIterator) IOException(java.io.IOException) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) PeekingIterator(com.google.common.collect.PeekingIterator) Iterators(com.google.common.collect.Iterators) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) NoopTask(org.apache.gobblin.runtime.task.NoopTask) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) Lists(com.google.common.collect.Lists) BasicWorkUnitStream(org.apache.gobblin.source.workunit.BasicWorkUnitStream) SourceState(org.apache.gobblin.configuration.SourceState) Optional(java.util.Optional) URNIdentified(org.apache.gobblin.dataset.URNIdentified) StreamSupport(java.util.stream.StreamSupport) Spliterator(java.util.Spliterator) Dataset(org.apache.gobblin.dataset.Dataset) URNLexicographicalComparator(org.apache.gobblin.dataset.comparators.URNLexicographicalComparator) Nullable(javax.annotation.Nullable) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) PartitionableDataset(org.apache.gobblin.dataset.PartitionableDataset) Dataset(org.apache.gobblin.dataset.Dataset) IOException(java.io.IOException)

Example 14 with Dataset

use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.

In the class ComplianceRestoreJob, the method run:

/**
 * Restores all discovered datasets asynchronously.
 *
 * <p>Submits one restore task per dataset; each task's completion — success
 * or failure — counts down {@code finishCleanSignal}. Failures are recorded
 * in {@code throwables} and reported as a compliance restore-failed event.
 *
 * @throws IOException if dataset discovery fails
 */
public void run() throws IOException {
    Preconditions.checkNotNull(this.finder, "Dataset finder class is not set");
    List<Dataset> foundDatasets = this.finder.findDatasets();
    this.finishCleanSignal = Optional.of(new CountDownLatch(foundDatasets.size()));
    for (final Dataset dataset : foundDatasets) {
        ListenableFuture<Void> restoreFuture = this.service.submit(() -> {
            // Only datasets implementing RestorableDataset can be restored;
            // anything else is logged and skipped.
            if (dataset instanceof RestorableDataset) {
                log.info("Trying to restore");
                ((RestorableDataset) dataset).restore();
            } else {
                log.warn("Not an instance of " + RestorableDataset.class + " Dataset won't be restored " + dataset.datasetURN());
            }
            return null;
        });
        Futures.addCallback(restoreFuture, new FutureCallback<Void>() {

            @Override
            public void onSuccess(@Nullable Void result) {
                ComplianceRestoreJob.this.finishCleanSignal.get().countDown();
                log.info("Successfully restored: " + dataset.datasetURN());
            }

            @Override
            public void onFailure(Throwable t) {
                ComplianceRestoreJob.this.finishCleanSignal.get().countDown();
                log.warn("Exception caught when restoring " + dataset.datasetURN() + ".", t);
                ComplianceRestoreJob.this.throwables.add(t);
                ComplianceRestoreJob.this.eventSubmitter.submit(ComplianceEvents.Restore.FAILED_EVENT_NAME, ImmutableMap.of(ComplianceEvents.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(t), ComplianceEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
            }
        });
    }
}
Also used : Dataset(org.apache.gobblin.dataset.Dataset) CountDownLatch(java.util.concurrent.CountDownLatch) TException(org.apache.thrift.TException) IOException(java.io.IOException)

Example 15 with Dataset

use of org.apache.gobblin.dataset.Dataset in project incubator-gobblin by apache.

In the class ComplianceValidationJob, the method run:

/**
 * Validates all discovered datasets asynchronously.
 *
 * <p>Submits one validation task per dataset; each task's completion —
 * success or failure — counts down {@code finishCleanSignal}. Failures are
 * recorded in {@code throwables} and reported as a compliance
 * validation-failed event.
 *
 * @throws IOException if dataset discovery fails
 */
public void run() throws IOException {
    Preconditions.checkNotNull(this.finder, "Dataset finder class is not set");
    List<Dataset> foundDatasets = this.finder.findDatasets();
    this.finishCleanSignal = Optional.of(new CountDownLatch(foundDatasets.size()));
    for (final Dataset dataset : foundDatasets) {
        ListenableFuture<Void> validationFuture = this.service.submit(() -> {
            // Only datasets implementing ValidatableDataset can be validated;
            // anything else is logged and skipped.
            if (dataset instanceof ValidatableDataset) {
                ((ValidatableDataset) dataset).validate();
            } else {
                log.warn("Not an instance of " + ValidatableDataset.class + " Dataset won't be validated " + dataset.datasetURN());
            }
            return null;
        });
        Futures.addCallback(validationFuture, new FutureCallback<Void>() {

            @Override
            public void onSuccess(@Nullable Void result) {
                ComplianceValidationJob.this.finishCleanSignal.get().countDown();
                log.info("Successfully validated: " + dataset.datasetURN());
            }

            @Override
            public void onFailure(Throwable t) {
                ComplianceValidationJob.this.finishCleanSignal.get().countDown();
                log.warn("Exception caught when validating " + dataset.datasetURN() + ".", t);
                ComplianceValidationJob.this.throwables.add(t);
                ComplianceValidationJob.this.eventSubmitter.submit(ComplianceEvents.Validation.FAILED_EVENT_NAME, ImmutableMap.of(ComplianceEvents.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(t), ComplianceEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
            }
        });
    }
}
Also used : Dataset(org.apache.gobblin.dataset.Dataset) CountDownLatch(java.util.concurrent.CountDownLatch) TException(org.apache.thrift.TException) IOException(java.io.IOException)

Aggregations

Dataset (org.apache.gobblin.dataset.Dataset)15 IOException (java.io.IOException)7 SourceState (org.apache.gobblin.configuration.SourceState)6 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)6 PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset)6 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6 CountDownLatch (java.util.concurrent.CountDownLatch)4 SimpleDatasetForTesting (org.apache.gobblin.dataset.test.SimpleDatasetForTesting)4 SimpleDatasetPartitionForTesting (org.apache.gobblin.dataset.test.SimpleDatasetPartitionForTesting)4 SimplePartitionableDatasetForTesting (org.apache.gobblin.dataset.test.SimplePartitionableDatasetForTesting)4 StaticDatasetsFinderForTesting (org.apache.gobblin.dataset.test.StaticDatasetsFinderForTesting)4 Test (org.testng.annotations.Test)4 Config (com.typesafe.config.Config)3 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)3 CleanableDataset (org.apache.gobblin.data.management.retention.dataset.CleanableDataset)3 BasicWorkUnitStream (org.apache.gobblin.source.workunit.BasicWorkUnitStream)3 TException (org.apache.thrift.TException)3 List (java.util.List)2 Callable (java.util.concurrent.Callable)2