Search in sources :

Example 1 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.

the class ExternalDatasets method makeTrackable.

/**
 * Ensures the given input can be tracked, creating an external dataset for it when needed.
 * An input that is not an {@link Input.InputFormatProviderInput}, or whose {@link InputFormatProvider} is itself a
 * {@link Dataset}, is handed back unchanged. For any other input, an external dataset named after the input is
 * created unless a dataset with that name already exists, and an input reading from that dataset is returned.
 *
 * @param admin {@link Admin} used to look up and create the external dataset
 * @param input input to be tracked
 * @return a dataset input carrying the original alias if the input is an external source, otherwise the very same
 *         input that was passed in
 * @throws IllegalArgumentException if a dataset with the input's name exists but is not of the external dataset type
 */
public static Input makeTrackable(Admin admin, Input input) {
    // Only inputs backed by an input format provider may need an external dataset.
    boolean providerBacked = input instanceof Input.InputFormatProviderInput;
    if (!providerBacked) {
        return input;
    }
    String name = input.getName();
    Input.InputFormatProviderInput providerInput = (Input.InputFormatProviderInput) input;
    InputFormatProvider provider = providerInput.getInputFormatProvider();
    Map<String, String> providerConf = provider.getInputFormatConfiguration();
    // A provider that is itself a dataset needs no external dataset; return the input untouched.
    if (provider instanceof Dataset) {
        return input;
    }
    try {
        // These arguments serve both as the dataset properties and as the arguments of the returned input.
        Map<String, String> datasetArgs = new HashMap<>();
        datasetArgs.put("input.format.class", provider.getInputFormatClassName());
        datasetArgs.putAll(providerConf);
        if (admin.datasetExists(name)) {
            // The name is already taken: it is only acceptable if it belongs to an external dataset.
            String existingType = admin.getDatasetType(name);
            boolean isExternal = EXTERNAL_DATASET_TYPE.equals(existingType);
            if (!isExternal) {
                throw new IllegalArgumentException("An external source cannot have the same name as an existing CDAP Dataset instance " + name);
            }
        } else {
            // Properties and arguments cannot be told apart because they arrive mixed in a single
            // configuration object (CDAP-5674), so the created dataset's properties also hold runtime arguments.
            DatasetProperties properties = DatasetProperties.of(datasetArgs);
            admin.createDataset(name, EXTERNAL_DATASET_TYPE, properties);
        }
        return Input.ofDataset(name, Collections.unmodifiableMap(datasetArgs)).alias(input.getAlias());
    } catch (DatasetManagementException e) {
        // Rethrown through Guava's Throwables.propagate so callers need not declare the checked exception.
        throw Throwables.propagate(e);
    }
}
Also used : DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) HashMap(java.util.HashMap) Dataset(io.cdap.cdap.api.dataset.Dataset)

Example 2 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.

the class MapReduceBatchContext method setInput.

/**
 * Registers the given input with the MapReduce context and records its alias.
 * When preview is enabled and the input is an {@link Input.InputFormatProviderInput}, its provider is first
 * wrapped in a {@link LimitingInputFormatProvider} built with {@code getMaxPreviewRecords()}. The (possibly
 * wrapped) input is then passed through {@code suffixInput} and {@code ExternalDatasets.makeTrackable} before
 * being added to the context.
 *
 * @param input the input to register
 */
@Override
public void setInput(Input input) {
    Input registered = CALLER.callUnchecked(() -> {
        Input candidate;
        if (!isPreviewEnabled || !(input instanceof Input.InputFormatProviderInput)) {
            // No preview wrapping applies; use the input as given.
            candidate = input;
        } else {
            Input.InputFormatProviderInput providerInput = (Input.InputFormatProviderInput) input;
            InputFormatProvider delegate = providerInput.getInputFormatProvider();
            LimitingInputFormatProvider limited = new LimitingInputFormatProvider(delegate, getMaxPreviewRecords());
            // Rebuild the input around the limiting provider, keeping the original name and alias.
            candidate = Input.of(input.getName(), limited).alias(input.getAlias());
        }
        Input tracked = ExternalDatasets.makeTrackable(mrContext.getAdmin(), suffixInput(candidate));
        mrContext.addInput(tracked);
        return tracked;
    });
    inputNames.add(registered.getAlias());
}
Also used : Input(io.cdap.cdap.api.data.batch.Input) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) LimitingInputFormatProvider(io.cdap.cdap.etl.batch.preview.LimitingInputFormatProvider)

Example 3 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.

the class MapReduceRuntimeService method setInputsIfNeeded.

/**
 * Sets the configurations used for inputs.
 * Multiple mappers could be defined, so each mapper after the first is checked for output types that are
 * consistent with the first one while the inputs are being registered on the job.
 *
 * @param job the job on which every mapper input is registered via {@code MultipleInputs.addInput}
 * @return the TypeToken for one of the mappers (doesn't matter which one, since we check that all of their output
 * key/value types are consistent). Returns null if the mapper class was not configured directly on the job and the
 * job's mapper class is to be used.
 * @throws IllegalArgumentException if any of the configured mapper output types are inconsistent.
 */
@Nullable
private TypeToken<Mapper> setInputsIfNeeded(Job job) throws IOException, ClassNotFoundException {
    Class<? extends Mapper> jobMapperClass = job.getMapperClass();
    Class<? extends Mapper> firstMapperClass = null;
    Map.Entry<Class, Class> firstMapperOutputTypes = null;
    for (Map.Entry<String, MapperInput> mapperInputEntry : context.getMapperInputs().entrySet()) {
        MapperInput mapperInput = mapperInputEntry.getValue();
        Map<String, String> inputFormatConfiguration = mapperInput.getInputFormatConfiguration();
        // default to what is configured on the job, if user didn't specify a mapper for an input
        Class<? extends Mapper> mapperClass = mapperInput.getMapper() == null ? jobMapperClass : mapperInput.getMapper();
        // check output key/value type consistency, except for the first input
        if (firstMapperClass == null) {
            // the first mapper seen becomes the reference that all later mappers are compared against
            firstMapperClass = mapperClass;
            firstMapperOutputTypes = getMapperOutputKeyValueTypes(mapperClass);
        } else {
            assertConsistentTypes(firstMapperClass, firstMapperOutputTypes, mapperClass);
        }
        MultipleInputs.addInput(job, mapperInputEntry.getKey(), mapperInput.getInputFormatClassName(), inputFormatConfiguration, mapperClass);
    }
    // no per-input mapper was found, or it is the job's own mapper: resolve from the job configuration instead,
    // which yields null if the user didn't configure the mapper class explicitly
    if (firstMapperClass == null || firstMapperClass == jobMapperClass) {
        return resolveClass(job.getConfiguration(), MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
    }
    return resolveClass(firstMapperClass, Mapper.class);
}
Also used : MapperInput(io.cdap.cdap.internal.app.runtime.batch.dataset.input.MapperInput) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) Map(java.util.Map) HashMap(java.util.HashMap) AbstractMap(java.util.AbstractMap) Nullable(javax.annotation.Nullable)

Example 4 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.

the class SparkBatchSourceFactory method createInputRDD.

/**
 * Creates the pair RDD for the named input.
 * The name is looked up first among the registered input format providers and then among the registered
 * dataset infos; the first match decides how the RDD is read.
 *
 * @param sec execution context used to read from a dataset
 * @param jsc Spark context used to read through an input format
 * @param inputName name of the input to read
 * @param keyClass class of the RDD keys
 * @param valueClass class of the RDD values
 * @return the RDD backed by the matching input format provider or dataset
 * @throws IllegalStateException if the name is registered as neither an input format provider nor a dataset
 */
@SuppressWarnings("unchecked")
private <K, V> JavaPairRDD<K, V> createInputRDD(JavaSparkExecutionContext sec, JavaSparkContext jsc, String inputName, Class<K> keyClass, Class<V> valueClass) {
    boolean hasProvider = inputFormatProviders.containsKey(inputName);
    if (hasProvider) {
        InputFormatProvider provider = inputFormatProviders.get(inputName);
        // Use the thread's context class loader, falling back to the loader of this class.
        ClassLoader loader = Objects.firstNonNull(currentThread().getContextClassLoader(), getClass().getClassLoader());
        return RDDUtils.readUsingInputFormat(jsc, provider, loader, keyClass, valueClass);
    }
    boolean hasDataset = datasetInfos.containsKey(inputName);
    if (!hasDataset) {
        // The name is registered in neither inputFormatProviders nor datasetInfos.
        throw new IllegalStateException("Unknown source type");
    }
    DatasetInfo info = datasetInfos.get(inputName);
    return sec.fromDataset(info.getDatasetName(), info.getDatasetArgs());
}
Also used : BasicInputFormatProvider(io.cdap.cdap.etl.batch.BasicInputFormatProvider) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider)

Example 5 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.

the class FileConnector method getInputFormatProvider.

/**
 * Builds an {@link InputFormatProvider} for the path named in the sample request.
 * The provider reports {@code TextInputFormat} as its input format class and exposes a single configuration
 * entry, {@code FileInputFormat.INPUT_DIR}, read from a job on which the request path was added as input path.
 *
 * @param context the connector context (not used by this implementation)
 * @param request the sample request whose path is to be read
 * @return a provider describing how to read the requested path
 * @throws IOException propagated from the Hadoop job setup calls
 */
@Override
public InputFormatProvider getInputFormatProvider(ConnectorContext context, SampleRequest request) throws IOException {
    // The job is only used to hold the configuration that FileInputFormat fills in.
    Job job = Job.getInstance();
    Path inputPath = new Path(new File(request.getPath()).toURI());
    FileInputFormat.addInputPath(job, inputPath);
    InputFormatProvider textProvider = new InputFormatProvider() {

        @Override
        public String getInputFormatClassName() {
            return TextInputFormat.class.getName();
        }

        @Override
        public Map<String, String> getInputFormatConfiguration() {
            // Read the input directory back from the job configuration on every call.
            String inputDir = job.getConfiguration().get(FileInputFormat.INPUT_DIR);
            return Collections.singletonMap(FileInputFormat.INPUT_DIR, inputDir);
        }
    };
    return textProvider;
}
Also used : Path(org.apache.hadoop.fs.Path) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) Job(org.apache.hadoop.mapreduce.Job) File(java.io.File)

Aggregations

InputFormatProvider (io.cdap.cdap.api.data.batch.InputFormatProvider)7 LimitingInputFormatProvider (io.cdap.cdap.etl.batch.preview.LimitingInputFormatProvider)3 HashMap (java.util.HashMap)3 Input (io.cdap.cdap.api.data.batch.Input)2 BasicInputFormatProvider (io.cdap.cdap.etl.batch.BasicInputFormatProvider)2 Job (org.apache.hadoop.mapreduce.Job)2 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)1 Dataset (io.cdap.cdap.api.dataset.Dataset)1 DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException)1 LimitingInputFormat (io.cdap.cdap.etl.batch.preview.LimitingInputFormat)1 MapperInput (io.cdap.cdap.internal.app.runtime.batch.dataset.input.MapperInput)1 File (java.io.File)1 IOException (java.io.IOException)1 AbstractMap (java.util.AbstractMap)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 Nullable (javax.annotation.Nullable)1 Configuration (org.apache.hadoop.conf.Configuration)1 Path (org.apache.hadoop.fs.Path)1 InputSplit (org.apache.hadoop.mapreduce.InputSplit)1