Search in sources :

Example 21 with InputSplit

Use of org.apache.flink.core.io.InputSplit in the Apache Flink project.

Method invoke of the class DataSourceTask.

/**
 * Runs the data source: initializes the input format and the outputs, then reads every
 * assigned input split and forwards each non-null record to the output collector.
 *
 * <p>Sequence, as implemented below:
 * <ol>
 *   <li>Initialize the input format and the outputs; an output initialization failure is
 *       rethrown as a {@link RuntimeException} that keeps the original cause.</li>
 *   <li>Create the runtime context, register the {@code numSplitsProcessed} counter and
 *       obtain the records-out counter.</li>
 *   <li>If the format is a {@code RichInputFormat}, hand it the runtime context and call
 *       {@code openInputFormat()}.</li>
 *   <li>Open the chained tasks, then for each split: open the format, read until the end
 *       of the split or until the task is canceled, and close the format.</li>
 *   <li>Close the output and the chained tasks.</li>
 * </ol>
 *
 * <p>On failure the format is closed on a best-effort basis and the chained tasks are
 * canceled. A {@code CancelTaskException} is always rethrown; any other exception is
 * logged and rethrown only if the task has not been canceled. In every case the writers
 * are cleared and a rich format gets its {@code closeInputFormat()} call.
 *
 * @throws Exception if initialization, reading or closing fails (see the rules above)
 */
@Override
public void invoke() throws Exception {
    // --------------------------------------------------------------------
    // Initialize
    // --------------------------------------------------------------------
    initInputFormat();
    LOG.debug(getLogString("Start registering input and output"));
    try {
        initOutputs(getUserCodeClassLoader());
    } catch (Exception ex) {
        throw new RuntimeException("The initialization of the DataSource's outputs caused an error: " + ex.getMessage(), ex);
    }
    LOG.debug(getLogString("Finished registering input and output"));
    // --------------------------------------------------------------------
    // Invoke
    // --------------------------------------------------------------------
    LOG.debug(getLogString("Starting data source operator"));
    RuntimeContext ctx = createRuntimeContext();
    Counter completedSplitsCounter = ctx.getMetricGroup().counter("numSplitsProcessed");
    // Cast once instead of repeating the cast for every metric call.
    final OperatorMetricGroup operatorMetrics = (OperatorMetricGroup) ctx.getMetricGroup();
    operatorMetrics.getIOMetricGroup().reuseInputMetricsForTask();
    Counter numRecordsOut = operatorMetrics.getIOMetricGroup().getNumRecordsOutCounter();
    if (this.config.getNumberOfChainedStubs() == 0) {
        operatorMetrics.getIOMetricGroup().reuseOutputMetricsForTask();
    }
    // instanceof is the idiomatic (and null-safe) form of the Class.isAssignableFrom check.
    if (this.format instanceof RichInputFormat) {
        final RichInputFormat<?, ?> richFormat = (RichInputFormat<?, ?>) this.format;
        richFormat.setRuntimeContext(ctx);
        LOG.debug(getLogString("Rich Source detected. Initializing runtime context."));
        richFormat.openInputFormat();
        LOG.debug(getLogString("Rich Source detected. Opening the InputFormat."));
    }
    ExecutionConfig executionConfig = getExecutionConfig();
    boolean objectReuseEnabled = executionConfig.isObjectReuseEnabled();
    if (LOG.isDebugEnabled()) {
        LOG.debug("DataSourceTask object reuse: " + (objectReuseEnabled ? "ENABLED" : "DISABLED") + ".");
    }
    final TypeSerializer<OT> serializer = this.serializerFactory.getSerializer();
    try {
        // start all chained tasks
        BatchTask.openChainedTasks(this.chainedTasks, this);
        // get input splits to read
        final Iterator<InputSplit> splitIterator = getInputSplits();
        // for each assigned input split
        while (!this.taskCanceled && splitIterator.hasNext()) {
            // get start and end
            final InputSplit split = splitIterator.next();
            // Guarded like the "Closing input split" message below, so the split is not
            // stringified once per split while debug logging is disabled.
            if (LOG.isDebugEnabled()) {
                LOG.debug(getLogString("Opening input split " + split.toString()));
            }
            final InputFormat<OT, InputSplit> format = this.format;
            // open input format
            format.open(split);
            if (LOG.isDebugEnabled()) {
                LOG.debug(getLogString("Starting to read input from split " + split.toString()));
            }
            try {
                // wraps the task output so that every collected record increments numRecordsOut
                final Collector<OT> output = new CountingCollector<>(this.output, numRecordsOut);
                if (objectReuseEnabled) {
                    // a single instance is offered to the format for every record
                    OT reuse = serializer.createInstance();
                    // as long as there is data to read
                    while (!this.taskCanceled && !format.reachedEnd()) {
                        OT returned;
                        // a null return is skipped rather than emitted
                        if ((returned = format.nextRecord(reuse)) != null) {
                            output.collect(returned);
                        }
                    }
                } else {
                    // as long as there is data to read
                    while (!this.taskCanceled && !format.reachedEnd()) {
                        OT returned;
                        // a fresh instance is offered to the format for every record
                        if ((returned = format.nextRecord(serializer.createInstance())) != null) {
                            output.collect(returned);
                        }
                    }
                }
                if (LOG.isDebugEnabled() && !this.taskCanceled) {
                    LOG.debug(getLogString("Closing input split " + split.toString()));
                }
            } finally {
                // close. We close here such that a regular close throwing an exception marks a task as failed.
                format.close();
            }
            completedSplitsCounter.inc();
        }
        // end for all input splits
        // close the collector. if it is a chaining task collector, it will close its chained tasks
        this.output.close();
        // close all chained tasks letting them report failure
        BatchTask.closeChainedTasks(this.chainedTasks, this);
    } catch (Exception ex) {
        // close the input, but do not report any exceptions, since we already have another root cause
        try {
            this.format.close();
        } catch (Throwable ignored) {
            // deliberately ignored: 'ex' is the root cause that gets reported below
        }
        BatchTask.cancelChainedTasks(this.chainedTasks);
        ex = ExceptionInChainedStubException.exceptionUnwrap(ex);
        if (ex instanceof CancelTaskException) {
            // forward canceling exception
            throw ex;
        } else if (!this.taskCanceled) {
            // drop exception, if the task was canceled
            BatchTask.logAndThrowException(ex, this);
        }
    } finally {
        BatchTask.clearWriters(eventualOutputs);
        // --------------------------------------------------------------------
        // instanceof is false for null, so no separate null check is needed here
        if (this.format instanceof RichInputFormat) {
            ((RichInputFormat<?, ?>) this.format).closeInputFormat();
            LOG.debug(getLogString("Rich Source detected. Closing the InputFormat."));
        }
    }
    if (!this.taskCanceled) {
        LOG.debug(getLogString("Finished data source operator"));
    } else {
        LOG.debug(getLogString("Data source operator cancelled"));
    }
}
Also used : RichInputFormat(org.apache.flink.api.common.io.RichInputFormat) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) ExceptionInChainedStubException(org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException) NoSuchElementException(java.util.NoSuchElementException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) InputSplitProviderException(org.apache.flink.runtime.jobgraph.tasks.InputSplitProviderException) CountingCollector(org.apache.flink.runtime.operators.util.metrics.CountingCollector) Counter(org.apache.flink.metrics.Counter) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext) InputSplit(org.apache.flink.core.io.InputSplit)

Aggregations

InputSplit (org.apache.flink.core.io.InputSplit)21 Test (org.junit.Test)12 HashSet (java.util.HashSet)7 LocatableInputSplit (org.apache.flink.core.io.LocatableInputSplit)6 LocatableInputSplitAssigner (org.apache.flink.api.common.io.LocatableInputSplitAssigner)5 InputSplitProviderException (org.apache.flink.runtime.jobgraph.tasks.InputSplitProviderException)4 ArrayList (java.util.ArrayList)3 NoSuchElementException (java.util.NoSuchElementException)3 GenericInputSplit (org.apache.flink.core.io.GenericInputSplit)3 IOException (java.io.IOException)2 Iterator (java.util.Iterator)2 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)2 JobID (org.apache.flink.api.common.JobID)2 DefaultInputSplitAssigner (org.apache.flink.api.common.io.DefaultInputSplitAssigner)2 InputFormat (org.apache.flink.api.common.io.InputFormat)2 RichInputFormat (org.apache.flink.api.common.io.RichInputFormat)2 GenericParameterValuesProvider (org.apache.flink.api.java.io.jdbc.split.GenericParameterValuesProvider)2 ParameterValuesProvider (org.apache.flink.api.java.io.jdbc.split.ParameterValuesProvider)2 InputSplitAssigner (org.apache.flink.core.io.InputSplitAssigner)2 JobException (org.apache.flink.runtime.JobException)2