Search in sources :

Example 1 with TaskPublisher

use of org.apache.gobblin.publisher.TaskPublisher in project incubator-gobblin by apache.

the class Fork method checkDataQuality.

/**
 * Runs the task-level quality policies for this fork and asks the resulting
 * {@link TaskPublisher} whether the fork's data may be published.
 *
 * <p>Before the policies run, the fork-level task state is populated with the expected and
 * extracted row counts (only when there is more than one branch), the number of records the
 * writer reports (or {@code 0} when no writer is present) and, when supplied, the string form
 * of the schema. The writer count is also recorded on the task state under a branch-specific key.
 *
 * @param schema optional schema; when present its {@code toString()} is stored under
 *               {@link ConfigurationKeys#EXTRACT_SCHEMA}
 * @return {@code true} only when the publisher reports {@code SUCCESS}; {@code false} for any
 *         other publisher state, or when the policy check or publisher lookup throws
 */
private boolean checkDataQuality(Optional<Object> schema) throws Exception {
    final boolean multiBranch = this.branches > 1;

    // With several branches the fork state needs its own copy of the extractor row counts.
    if (multiBranch) {
        this.forkTaskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED, this.taskState.getProp(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED));
        this.forkTaskState.setProp(ConfigurationKeys.EXTRACTOR_ROWS_EXTRACTED, this.taskState.getProp(ConfigurationKeys.EXTRACTOR_ROWS_EXTRACTED));
    }

    // Record how many records were written, both on the fork state and (per branch) on the task state.
    final String branchRecordsWrittenKey = ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_RECORDS_WRITTEN, this.branches, this.index);
    if (!this.writer.isPresent()) {
        this.forkTaskState.setProp(ConfigurationKeys.WRITER_ROWS_WRITTEN, 0L);
        this.taskState.setProp(branchRecordsWrittenKey, 0L);
    } else {
        this.forkTaskState.setProp(ConfigurationKeys.WRITER_ROWS_WRITTEN, this.writer.get().recordsWritten());
        this.taskState.setProp(branchRecordsWrittenKey, this.writer.get().recordsWritten());
    }

    if (schema.isPresent()) {
        this.forkTaskState.setProp(ConfigurationKeys.EXTRACT_SCHEMA, schema.get().toString());
    }

    try {
        // Task-level quality checking; -1 is passed as the index when there is a single branch.
        int policyBranchIndex = multiBranch ? this.index : -1;
        TaskLevelPolicyCheckResults policyResults = this.taskContext.getTaskLevelPolicyChecker(this.forkTaskState, policyBranchIndex).executePolicies();
        TaskPublisher taskPublisher = this.taskContext.getTaskPublisher(this.forkTaskState, policyResults);

        // Every non-SUCCESS state maps to false; the known failure states are logged first.
        switch (taskPublisher.canPublish()) {
            case SUCCESS:
                return true;
            case CLEANUP_FAIL:
                this.logger.error("Cleanup failed for task " + this.taskId);
                return false;
            case POLICY_TESTS_FAIL:
                this.logger.error("Not all quality checking passed for task " + this.taskId);
                return false;
            case COMPONENTS_NOT_FINISHED:
                this.logger.error("Not all components completed for task " + this.taskId);
                return false;
            default:
                return false;
        }
    } catch (Throwable t) {
        // Any failure while checking is treated as "do not publish" rather than propagated.
        this.logger.error("Failed to check task-level data quality", t);
        return false;
    }
}
Also used : TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)

Example 2 with TaskPublisher

use of org.apache.gobblin.publisher.TaskPublisher in project incubator-gobblin by apache.

the class TaskContinuousTest method testContinuousTaskOneRecord.

/**
 * Test that a streaming task will work correctly when the extractor only produces one record.
 *
 * <p>The task is executed on a dedicated single-threaded executor, polled twice at one-second
 * intervals for committed watermarks, then shut down and verified. The executor is shut down
 * in a {@code finally} block so that it is released even when an assertion fails part-way
 * through the test.
 *
 * @throws Exception if setting up, running or shutting down the task fails
 */
@Test
public void testContinuousTaskOneRecord() throws Exception {
    ArrayList<Object> recordCollector = new ArrayList<>(100);
    String testRecord = "hello";
    OneRecordExtractor oneRecordExtractor = new OneRecordExtractor(testRecord);
    TaskContext mockTaskContext = getMockTaskContext(recordCollector, oneRecordExtractor);
    // Create a mock TaskPublisher
    TaskPublisher mockTaskPublisher = mock(TaskPublisher.class);
    when(mockTaskPublisher.canPublish()).thenReturn(TaskPublisher.PublisherState.SUCCESS);
    when(mockTaskContext.getTaskPublisher(any(TaskState.class), any(TaskLevelPolicyCheckResults.class))).thenReturn(mockTaskPublisher);
    // Create a mock TaskStateTracker
    TaskStateTracker mockTaskStateTracker = mock(TaskStateTracker.class);
    // Create a TaskExecutor - a real TaskExecutor must be created so a Fork is run in a separate thread
    TaskExecutor taskExecutor = new TaskExecutor(new Properties());
    // Create the Task
    Task task = new Task(mockTaskContext, mockTaskStateTracker, taskExecutor, Optional.<CountDownLatch>absent());
    ScheduledExecutorService taskRunner = new ScheduledThreadPoolExecutor(1, ExecutorsUtils.newThreadFactory(Optional.of(log)));
    try {
        taskRunner.execute(task);
        // Let the task run for 2 seconds, checking committed watermarks after each second
        int sleepIterations = 2;
        WatermarkStorage mockWatermarkStorage = mockTaskContext.getWatermarkStorage();
        for (int currentIteration = 0; currentIteration < sleepIterations; currentIteration++) {
            Thread.sleep(1000);
            Map<String, CheckpointableWatermark> observedWatermarks = mockWatermarkStorage.getCommittedWatermarks(CheckpointableWatermark.class, ImmutableList.of("default"));
            if (!observedWatermarks.isEmpty()) {
                for (CheckpointableWatermark watermark : observedWatermarks.values()) {
                    log.info("Observed committed watermark: {}", watermark);
                }
                log.info("Task progress: {}", task.getProgress());
                // Ensure that watermarks seem reasonable at each step
                Assert.assertTrue(oneRecordExtractor.validateWatermarks(false, observedWatermarks));
            }
        }
        // Let's try to shutdown the task
        task.shutdown();
        log.info("Shutting down task now");
        boolean success = task.awaitShutdown(3000);
        Assert.assertTrue(success, "Task should shutdown in 3 seconds");
        log.info("Task done waiting to shutdown {}", success);
        Map<String, CheckpointableWatermark> finalWatermarks = mockWatermarkStorage.getCommittedWatermarks(CheckpointableWatermark.class, ImmutableList.of("0"));
        // Ensure that committed watermarks match exactly the input rows because we shutdown in an orderly manner.
        Assert.assertTrue(oneRecordExtractor.validateWatermarks(true, finalWatermarks));
        // Ensure that the record made it to the writer correctly
        Assert.assertEquals(recordCollector.size(), 1);
        Assert.assertEquals(recordCollector.get(0), testRecord);
        task.commit();
    } finally {
        // Shutdown the executor even if an assertion above failed, so the executor is not left open
        taskRunner.shutdown();
        taskRunner.awaitTermination(100, TimeUnit.MILLISECONDS);
    }
}
Also used : WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) ArrayList(java.util.ArrayList) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults) Properties(java.util.Properties) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) DefaultCheckpointableWatermark(org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark) Test(org.testng.annotations.Test)

Example 3 with TaskPublisher

use of org.apache.gobblin.publisher.TaskPublisher in project incubator-gobblin by apache.

the class TestRecordStream method setupTask.

/**
 * Builds a {@link Task} backed by a mocked {@link TaskContext} that hands out the supplied
 * extractor, writer builder, converters and record stream processors.
 *
 * <p>The task state has {@link ConfigurationKeys#TASK_SYNCHRONOUS_EXECUTION_MODEL_KEY} set to
 * {@code false}, the mocked publisher always reports {@code SUCCESS}, and the returned task is a
 * Mockito spy whose {@code submitTaskCommittedEvent()} is stubbed to do nothing.
 *
 * @param extractor extractor returned by the mocked context
 * @param writer writer builder returned by the mocked context for any branch
 * @param converters converters returned by the mocked context
 * @param recordStreamProcessors record stream processors returned by the mocked context
 * @return the spied task
 * @throws Exception if any of the collaborators cannot be created
 */
private Task setupTask(Extractor extractor, DataWriterBuilder writer, List<Converter<?, ?, ?, ?>> converters, List<RecordStreamProcessor<?, ?, ?, ?>> recordStreamProcessors) throws Exception {
    // Task state with the synchronous execution model switched off
    TaskState state = getEmptyTestTaskState("testRetryTaskId");
    state.setProp(ConfigurationKeys.TASK_SYNCHRONOUS_EXECUTION_MODEL_KEY, false);

    // Two independent row-level checkers: one for the no-arg lookup, one for the per-branch lookup
    RowLevelPolicyChecker defaultRowChecker = new RowLevelPolicyChecker(Lists.newArrayList(), "ss", FileSystem.getLocal(new Configuration()));
    RowLevelPolicyChecker branchRowChecker = new RowLevelPolicyChecker(Lists.newArrayList(), "ss", FileSystem.getLocal(new Configuration()));

    // Publisher mock that always allows publishing
    TaskPublisher publisher = mock(TaskPublisher.class);
    when(publisher.canPublish()).thenReturn(TaskPublisher.PublisherState.SUCCESS);

    // Context mock: pipeline components
    TaskContext context = mock(TaskContext.class);
    when(context.getTaskState()).thenReturn(state);
    when(context.getExtractor()).thenReturn(extractor);
    when(context.getConverters()).thenReturn(converters);
    when(context.getRecordStreamProcessors()).thenReturn(recordStreamProcessors);
    when(context.getForkOperator()).thenReturn(new IdentityForkOperator());
    when(context.getDataWriterBuilder(anyInt(), anyInt())).thenReturn(writer);

    // Context mock: quality checking and publishing
    when(context.getRowLevelPolicyChecker()).thenReturn(defaultRowChecker);
    when(context.getRowLevelPolicyChecker(anyInt())).thenReturn(branchRowChecker);
    when(context.getTaskLevelPolicyChecker(any(TaskState.class), anyInt())).thenReturn(mock(TaskLevelPolicyChecker.class));
    when(context.getTaskPublisher(any(TaskState.class), any(TaskLevelPolicyCheckResults.class))).thenReturn(publisher);

    // A real TaskExecutor must be created so a Fork is run in a separate thread
    TaskStateTracker stateTracker = mock(TaskStateTracker.class);
    TaskExecutor executor = new TaskExecutor(new Properties());

    // Spy on the real task so the committed-event submission can be suppressed
    Task spiedTask = spy(new Task(context, stateTracker, executor, Optional.<CountDownLatch>absent()));
    doNothing().when(spiedTask).submitTaskCommittedEvent();
    return spiedTask;
}
Also used : IdentityForkOperator(org.apache.gobblin.fork.IdentityForkOperator) TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) Configuration(org.apache.hadoop.conf.Configuration) TaskLevelPolicyChecker(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyChecker) RowLevelPolicyChecker(org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults) Properties(java.util.Properties)

Example 4 with TaskPublisher

use of org.apache.gobblin.publisher.TaskPublisher in project incubator-gobblin by apache.

the class TaskContinuousTest method getMockTaskContext.

/**
 * Builds a mocked {@link TaskContext} around the task state from {@code getStreamingTaskState()}.
 *
 * <p>The context returns the given extractor (as both the extractor and the raw source
 * extractor), a fresh {@link MockWatermarkStorage}, an {@link IdentityForkOperator}, a single
 * shared {@link RowLevelPolicyChecker}, a publisher mock that always reports {@code SUCCESS},
 * and a {@link TestStreamingDataWriterBuilder} constructed with {@code recordCollector}.
 *
 * @param recordCollector list handed to the test writer builder
 * @param mockExtractor extractor the context should return
 * @return the configured mock context
 * @throws Exception if the local file system or another collaborator cannot be created
 */
private TaskContext getMockTaskContext(ArrayList<Object> recordCollector, Extractor mockExtractor) throws Exception {
    TaskState state = getStreamingTaskState();

    // Collaborators the context will hand out
    RowLevelPolicyChecker rowChecker = new RowLevelPolicyChecker(Lists.newArrayList(), "stateId", FileSystem.getLocal(new Configuration()));
    WatermarkStorage watermarkStorage = new MockWatermarkStorage();
    TaskPublisher publisher = mock(TaskPublisher.class);
    when(publisher.canPublish()).thenReturn(TaskPublisher.PublisherState.SUCCESS);

    TaskContext context = mock(TaskContext.class);

    // State and metrics
    when(context.getTaskState()).thenReturn(state);
    when(context.getTaskMetrics()).thenReturn(TaskMetrics.get(state));

    // Source side
    when(context.getExtractor()).thenReturn(mockExtractor);
    when(context.getRawSourceExtractor()).thenReturn(mockExtractor);
    when(context.getForkOperator()).thenReturn(new IdentityForkOperator());

    // Quality checking
    when(context.getRowLevelPolicyChecker()).thenReturn(rowChecker);
    when(context.getRowLevelPolicyChecker(anyInt())).thenReturn(rowChecker);
    when(context.getTaskLevelPolicyChecker(any(TaskState.class), anyInt())).thenReturn(mock(TaskLevelPolicyChecker.class));

    // Sink side
    when(context.getDataWriterBuilder(anyInt(), anyInt())).thenReturn(new TestStreamingDataWriterBuilder(recordCollector));
    when(context.getWatermarkStorage()).thenReturn(watermarkStorage);
    when(context.getTaskPublisher(any(TaskState.class), any(TaskLevelPolicyCheckResults.class))).thenReturn(publisher);

    return context;
}
Also used : IdentityForkOperator(org.apache.gobblin.fork.IdentityForkOperator) WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) Configuration(org.apache.hadoop.conf.Configuration) TaskLevelPolicyChecker(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyChecker) RowLevelPolicyChecker(org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)

Example 5 with TaskPublisher

use of org.apache.gobblin.publisher.TaskPublisher in project incubator-gobblin by apache.

the class TaskTest method getMockTaskContext.

/**
 * Builds a mocked {@link TaskContext} for a task with one fork per entry of
 * {@code writerCollectors}.
 *
 * <p>The context returns the given task state, extractor (as both the extractor and the raw
 * source extractor) and fork operator, a spied {@link RowLevelPolicyChecker} stubbed to accept
 * every record and to return an empty {@link State} as its final state, a publisher mock that
 * always reports {@code SUCCESS}, and for branch {@code i} a
 * {@link RecordCollectingWriterBuilder} constructed with {@code writerCollectors.get(i)}.
 *
 * @param taskState task state the context should return
 * @param mockExtractor extractor the context should return
 * @param writerCollectors one list per fork, each handed to that fork's writer builder
 * @param mockForkOperator fork operator the context should return
 * @return the configured mock context
 * @throws Exception if the local file system or another collaborator cannot be created
 */
private TaskContext getMockTaskContext(TaskState taskState, Extractor mockExtractor, ArrayList<ArrayList<Object>> writerCollectors, ForkOperator mockForkOperator) throws Exception {
    final int forkCount = writerCollectors.size();

    // Spy on a real checker with no policies, then pin its results
    RowLevelPolicyChecker rowChecker = spy(new RowLevelPolicyChecker(Lists.newArrayList(), "ss", FileSystem.getLocal(new Configuration())));
    when(rowChecker.executePolicies(any(Object.class), any(RowLevelPolicyCheckResults.class))).thenReturn(true);
    when(rowChecker.getFinalState()).thenReturn(new State());

    // Publisher mock that always allows publishing
    TaskPublisher publisher = mock(TaskPublisher.class);
    when(publisher.canPublish()).thenReturn(TaskPublisher.PublisherState.SUCCESS);

    TaskContext context = mock(TaskContext.class);

    // State and source side
    when(context.getTaskState()).thenReturn(taskState);
    when(context.getExtractor()).thenReturn(mockExtractor);
    when(context.getRawSourceExtractor()).thenReturn(mockExtractor);
    when(context.getForkOperator()).thenReturn(mockForkOperator);

    // Quality checking and publishing
    when(context.getRowLevelPolicyChecker()).thenReturn(rowChecker);
    when(context.getRowLevelPolicyChecker(anyInt())).thenReturn(rowChecker);
    when(context.getTaskLevelPolicyChecker(any(TaskState.class), anyInt())).thenReturn(mock(TaskLevelPolicyChecker.class));
    when(context.getTaskPublisher(any(TaskState.class), any(TaskLevelPolicyCheckResults.class))).thenReturn(publisher);

    // One writer builder per fork, each bound to that fork's collector
    for (int branch = 0; branch < forkCount; ++branch) {
        when(context.getDataWriterBuilder(forkCount, branch)).thenReturn(new RecordCollectingWriterBuilder(writerCollectors.get(branch)));
    }

    return context;
}
Also used : TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) Configuration(org.apache.hadoop.conf.Configuration) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) TaskLevelPolicyChecker(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyChecker) RowLevelPolicyChecker(org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults) RowLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.row.RowLevelPolicyCheckResults)

Aggregations

TaskPublisher (org.apache.gobblin.publisher.TaskPublisher)7 TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)7 RowLevelPolicyChecker (org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker)5 TaskLevelPolicyChecker (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyChecker)5 Configuration (org.apache.hadoop.conf.Configuration)5 Properties (java.util.Properties)4 IdentityForkOperator (org.apache.gobblin.fork.IdentityForkOperator)4 Test (org.testng.annotations.Test)3 WatermarkStorage (org.apache.gobblin.writer.WatermarkStorage)2 ArrayList (java.util.ArrayList)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 ScheduledThreadPoolExecutor (java.util.concurrent.ScheduledThreadPoolExecutor)1 BasicAckableForTesting (org.apache.gobblin.ack.BasicAckableForTesting)1 State (org.apache.gobblin.configuration.State)1 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)1 RowLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.row.RowLevelPolicyCheckResults)1 CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark)1 DefaultCheckpointableWatermark (org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark)1 RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope)1 StreamEntity (org.apache.gobblin.stream.StreamEntity)1