
Example 1 with InputFormatProvider

Use of co.cask.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata, in the class ExternalDatasets, method makeTrackable.

/**
   * If the input is an external source, an external dataset is created for tracking purposes and returned.
   * If the input is a regular dataset or a stream, it is already trackable, hence the same input is returned.
   *
   * @param admin {@link Admin} used to create the external dataset
   * @param input the input to be tracked
   * @return an external dataset input if the input is an external source, otherwise the same input that was passed in
   */
public static Input makeTrackable(Admin admin, Input input) {
    // If input is not an external source, return the same input as it can be tracked by itself.
    if (!(input instanceof Input.InputFormatProviderInput)) {
        return input;
    }
    // Input is an external source, create an external dataset so that it can be tracked.
    String inputName = input.getName();
    InputFormatProvider inputFormatProvider = ((Input.InputFormatProviderInput) input).getInputFormatProvider();
    Map<String, String> inputFormatConfiguration = inputFormatProvider.getInputFormatConfiguration();
    // an InputFormatProvider that is itself a Dataset is already trackable without an external dataset
    if (inputFormatProvider instanceof Dataset) {
        return input;
    }
    try {
        // Create an external dataset for the input format for lineage tracking
        Map<String, String> arguments = new HashMap<>();
        arguments.put("input.format.class", inputFormatProvider.getInputFormatClassName());
        arguments.putAll(inputFormatConfiguration);
        if (!admin.datasetExists(inputName)) {
            // Note: the dataset properties are the same as the arguments, since the two cannot be
            // distinguished; they are mixed together in a single configuration object (CDAP-5674).
            // For the same reason, the properties of the created external dataset will contain runtime arguments.
            admin.createDataset(inputName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
        } else {
            // Check if the external dataset name clashes with an existing CDAP Dataset
            String datasetType = admin.getDatasetType(inputName);
            if (!EXTERNAL_DATASET_TYPE.equals(datasetType)) {
                throw new IllegalArgumentException("An external source cannot have the same name as an existing CDAP Dataset instance " + inputName);
            }
        }
        return Input.ofDataset(inputName, Collections.unmodifiableMap(arguments)).alias(input.getAlias());
    } catch (DatasetManagementException e) {
        throw Throwables.propagate(e);
    }
}
Also used: DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException), InputFormatProvider (co.cask.cdap.api.data.batch.InputFormatProvider), HashMap (java.util.HashMap), Dataset (co.cask.cdap.api.dataset.Dataset)
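
As a usage illustration, here is a minimal, hypothetical sketch of wrapping an external source and making it trackable. The anonymous provider, the input name "externalFiles", and the in-scope admin variable are assumptions for illustration, not part of the CDAP source above.

// Hypothetical external source: a Hadoop TextInputFormat reading a fixed directory.
InputFormatProvider provider = new InputFormatProvider() {

    @Override
    public String getInputFormatClassName() {
        return "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
    }

    @Override
    public Map<String, String> getInputFormatConfiguration() {
        return Collections.singletonMap("mapreduce.input.fileinputformat.inputdir", "/data/external");
    }
};
// Wrap the provider as a named Input, then make it trackable: makeTrackable
// creates (or validates) an external dataset named "externalFiles" for lineage.
Input externalInput = Input.of("externalFiles", provider);
Input trackable = ExternalDatasets.makeTrackable(admin, externalInput);

Because the provider here is not itself a Dataset, makeTrackable takes the external-dataset branch and the returned Input carries the same alias as the original.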

Example 2 with InputFormatProvider

Use of co.cask.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata, in the class SparkBatchSourceFactory, method createInputRDD.

@SuppressWarnings("unchecked")
private <K, V> JavaPairRDD<K, V> createInputRDD(JavaSparkExecutionContext sec, JavaSparkContext jsc, String inputName, Class<K> keyClass, Class<V> valueClass) {
    if (streams.containsKey(inputName)) {
        Input.StreamInput streamInput = streams.get(inputName);
        FormatSpecification formatSpec = streamInput.getBodyFormatSpec();
        if (formatSpec != null) {
            return (JavaPairRDD<K, V>) sec.fromStream(streamInput.getName(), formatSpec, streamInput.getStartTime(), streamInput.getEndTime(), StructuredRecord.class);
        }
        String decoderType = streamInput.getDecoderType();
        if (decoderType == null) {
            return (JavaPairRDD<K, V>) sec.fromStream(streamInput.getName(), streamInput.getStartTime(), streamInput.getEndTime(), valueClass);
        } else {
            try {
                Class<StreamEventDecoder<K, V>> decoderClass = (Class<StreamEventDecoder<K, V>>) Thread.currentThread().getContextClassLoader().loadClass(decoderType);
                return sec.fromStream(streamInput.getName(), streamInput.getStartTime(), streamInput.getEndTime(), decoderClass, keyClass, valueClass);
            } catch (Exception e) {
                throw Throwables.propagate(e);
            }
        }
    }
    if (inputFormatProviders.containsKey(inputName)) {
        InputFormatProvider inputFormatProvider = inputFormatProviders.get(inputName);
        Configuration hConf = new Configuration();
        hConf.clear();
        for (Map.Entry<String, String> entry : inputFormatProvider.getInputFormatConfiguration().entrySet()) {
            hConf.set(entry.getKey(), entry.getValue());
        }
        ClassLoader classLoader = Objects.firstNonNull(currentThread().getContextClassLoader(), getClass().getClassLoader());
        try {
            @SuppressWarnings("unchecked") Class<InputFormat> inputFormatClass = (Class<InputFormat>) classLoader.loadClass(inputFormatProvider.getInputFormatClassName());
            return jsc.newAPIHadoopRDD(hConf, inputFormatClass, keyClass, valueClass);
        } catch (ClassNotFoundException e) {
            throw Throwables.propagate(e);
        }
    }
    if (datasetInfos.containsKey(inputName)) {
        DatasetInfo datasetInfo = datasetInfos.get(inputName);
        return sec.fromDataset(datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
    }
    // This should never happen: the private constructor is only reachable through the static create() methods,
    // which make sure that one and only one of these source types is specified.
    throw new IllegalStateException("Unknown source type");
}
Also used: InputFormatProvider (co.cask.cdap.api.data.batch.InputFormatProvider), Configuration (org.apache.hadoop.conf.Configuration), FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), StreamEventDecoder (co.cask.cdap.api.stream.StreamEventDecoder), Input (co.cask.cdap.api.data.batch.Input), InputFormat (org.apache.hadoop.mapreduce.InputFormat), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), ImmutableMap (com.google.common.collect.ImmutableMap), HashMap (java.util.HashMap), Map (java.util.Map)
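
The InputFormatProvider branch above is essentially Spark's generic Hadoop-input path. For comparison, here is a minimal sketch of the same newAPIHadoopRDD call made directly against a concrete format; the input directory and the choice of the new-API TextInputFormat are illustrative assumptions.

// Sketch: build a Configuration holding only the entries we need, then create a
// JavaPairRDD from a Hadoop InputFormat, mirroring the generic provider branch above.
// Uses org.apache.hadoop.mapreduce.lib.input.TextInputFormat (the new-API class),
// which yields LongWritable offsets as keys and Text lines as values.
Configuration hConf = new Configuration();
hConf.clear();
hConf.set("mapreduce.input.fileinputformat.inputdir", "/data/input");
JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopRDD(hConf, TextInputFormat.class, LongWritable.class, Text.class);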

Example 3 with InputFormatProvider

Use of co.cask.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata, in the class MapReduceRuntimeService, method setInputsIfNeeded.

/**
   * Sets the configurations used for inputs.
   * Multiple mappers could be defined, so we first check that their output types are consistent.
   *
   * @return the TypeToken for one of the mappers (it doesn't matter which one, since we check that all of their
   * output key/value types are consistent); returns null if the mapper class was not configured directly on the
   * job and the job's mapper class is to be used
   * @throws IllegalArgumentException if any of the configured mapper output types are inconsistent
   */
@Nullable
private TypeToken<Mapper> setInputsIfNeeded(Job job) throws IOException, ClassNotFoundException {
    Class<? extends Mapper> jobMapperClass = job.getMapperClass();
    Class<? extends Mapper> firstMapperClass = null;
    Map.Entry<Class, Class> firstMapperOutputTypes = null;
    for (Map.Entry<String, MapperInput> mapperInputEntry : context.getMapperInputs().entrySet()) {
        MapperInput mapperInput = mapperInputEntry.getValue();
        InputFormatProvider provider = mapperInput.getInputFormatProvider();
        Map<String, String> inputFormatConfiguration = mapperInput.getInputFormatConfiguration();
        // default to what is configured on the job if the user didn't specify a mapper for this input
        Class<? extends Mapper> mapperClass = mapperInput.getMapper() == null ? jobMapperClass : mapperInput.getMapper();
        // check output key/value type consistency, except for the first input
        if (firstMapperClass == null) {
            firstMapperClass = mapperClass;
            firstMapperOutputTypes = getMapperOutputKeyValueTypes(mapperClass);
        } else {
            assertConsistentTypes(firstMapperClass, firstMapperOutputTypes, mapperClass);
        }
        // A bit of hacky special-casing for streams.
        if (provider instanceof StreamInputFormatProvider) {
            // pass in mapperInput.getMapper() instead of mapperClass, because mapperClass defaults to the Identity Mapper
            StreamInputFormatProvider inputFormatProvider = (StreamInputFormatProvider) provider;
            setDecoderForStream(inputFormatProvider, job, inputFormatConfiguration, mapperInput.getMapper());
            // authorization is enforced after the lineage/usage registration, since we want to track the intent
            // of reading from the stream.
            try {
                authorizationEnforcer.enforce(inputFormatProvider.getStreamId(), authenticationContext.getPrincipal(), Action.READ);
            } catch (Exception e) {
                Throwables.propagateIfPossible(e, IOException.class);
                throw new IOException(e);
            }
        }
        MultipleInputs.addInput(job, mapperInputEntry.getKey(), mapperInput.getInputFormatClassName(), inputFormatConfiguration, mapperClass);
    }
    // if no input specified its own mapper, or the first mapper is the job's mapper class, fall back to the
    // class from the job configuration, which is null if the user didn't configure the mapper class explicitly
    if (firstMapperClass == null || firstMapperClass == jobMapperClass) {
        return resolveClass(job.getConfiguration(), MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
    }
    return resolveClass(firstMapperClass, Mapper.class);
}
Also used: MapperInput (co.cask.cdap.internal.app.runtime.batch.dataset.input.MapperInput), InputFormatProvider (co.cask.cdap.api.data.batch.InputFormatProvider), StreamInputFormatProvider (co.cask.cdap.internal.app.runtime.batch.stream.StreamInputFormatProvider), IOException (java.io.IOException), Map (java.util.Map), HashMap (java.util.HashMap), AbstractMap (java.util.AbstractMap), ProvisionException (com.google.inject.ProvisionException), TransactionFailureException (org.apache.tephra.TransactionFailureException), URISyntaxException (java.net.URISyntaxException), TransactionConflictException (org.apache.tephra.TransactionConflictException), Nullable (javax.annotation.Nullable)
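
The consistency check depends on resolving each Mapper subclass's generic output types. Below is a simplified sketch of that reflection step using Guava's TypeToken; the helper name outputTypesOf is hypothetical and stands in for the getMapperOutputKeyValueTypes helper referenced above.

// Sketch: resolve the KEYOUT/VALUEOUT type parameters of a Mapper subclass.
// Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> declares its outputs at indices 2 and 3.
private static Map.Entry<Class, Class> outputTypesOf(Class<? extends Mapper> mapperClass) {
    TypeToken<?> token = TypeToken.of(mapperClass);
    TypeVariable<?>[] vars = Mapper.class.getTypeParameters();
    Class<?> keyOut = token.resolveType(vars[2]).getRawType();
    Class<?> valueOut = token.resolveType(vars[3]).getRawType();
    return new AbstractMap.SimpleEntry<>(keyOut, valueOut);
}

Two mapper classes are then consistent when outputTypesOf(first).equals(outputTypesOf(other)); assertConsistentTypes above throws IllegalArgumentException when they differ.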

Aggregations

InputFormatProvider (co.cask.cdap.api.data.batch.InputFormatProvider): 3
HashMap (java.util.HashMap): 3
Map (java.util.Map): 2
Input (co.cask.cdap.api.data.batch.Input): 1
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 1
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 1
Dataset (co.cask.cdap.api.dataset.Dataset): 1
DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException): 1
StreamEventDecoder (co.cask.cdap.api.stream.StreamEventDecoder): 1
MapperInput (co.cask.cdap.internal.app.runtime.batch.dataset.input.MapperInput): 1
StreamInputFormatProvider (co.cask.cdap.internal.app.runtime.batch.stream.StreamInputFormatProvider): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
ProvisionException (com.google.inject.ProvisionException): 1
IOException (java.io.IOException): 1
URISyntaxException (java.net.URISyntaxException): 1
AbstractMap (java.util.AbstractMap): 1
Nullable (javax.annotation.Nullable): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
InputFormat (org.apache.hadoop.mapreduce.InputFormat): 1
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 1