
Example 6 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.

the class MapReduceRuntimeService method setInputsIfNeeded.

/**
 * Sets the configurations used for inputs.
 * Multiple mappers could be defined, so we first check that their output types are consistent.
 *
 * @return the TypeToken for one of the mappers (it doesn't matter which one, since we check that all of their
 * output key/value types are consistent). Returns null if the mapper class was not configured directly on the
 * job and the job's mapper class is to be used.
 * @throws IllegalArgumentException if any of the configured mapper output types are inconsistent.
 */
@Nullable
private TypeToken<Mapper> setInputsIfNeeded(Job job) throws IOException, ClassNotFoundException {
    Class<? extends Mapper> jobMapperClass = job.getMapperClass();
    Class<? extends Mapper> firstMapperClass = null;
    Map.Entry<Class, Class> firstMapperOutputTypes = null;
    for (Map.Entry<String, MapperInput> mapperInputEntry : context.getMapperInputs().entrySet()) {
        MapperInput mapperInput = mapperInputEntry.getValue();
        InputFormatProvider provider = mapperInput.getInputFormatProvider();
        Map<String, String> inputFormatConfiguration = mapperInput.getInputFormatConfiguration();
        // default to what is configured on the job, if user didn't specify a mapper for an input
        Class<? extends Mapper> mapperClass = mapperInput.getMapper() == null ? jobMapperClass : mapperInput.getMapper();
        // check output key/value type consistency, except for the first input
        if (firstMapperClass == null) {
            firstMapperClass = mapperClass;
            firstMapperOutputTypes = getMapperOutputKeyValueTypes(mapperClass);
        } else {
            assertConsistentTypes(firstMapperClass, firstMapperOutputTypes, mapperClass);
        }
        MultipleInputs.addInput(job, mapperInputEntry.getKey(), mapperInput.getInputFormatClassName(), inputFormatConfiguration, mapperClass);
    }
    // fall back to the mapper class configured on the job; resolveClass returns null if the user didn't configure it explicitly
    if (firstMapperClass == null || firstMapperClass == jobMapperClass) {
        return resolveClass(job.getConfiguration(), MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
    }
    return resolveClass(firstMapperClass, Mapper.class);
}
Also used : MapperInput(io.cdap.cdap.internal.app.runtime.batch.dataset.input.MapperInput) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) Map(java.util.Map) HashMap(java.util.HashMap) AbstractMap(java.util.AbstractMap) Nullable(javax.annotation.Nullable)
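
As a side note to the type-consistency check described in the Javadoc above, here is a minimal sketch (not the CDAP source; the class and method names are hypothetical) of how a mapper's output key/value types can be resolved and compared with Guava's TypeToken:

import com.google.common.reflect.TypeToken;
import org.apache.hadoop.mapreduce.Mapper;

final class MapperOutputTypes {

    // Resolve the KEYOUT (index 2) and VALUEOUT (index 3) type parameters of a Mapper subclass.
    static TypeToken<?>[] resolveOutputTypes(Class<? extends Mapper> mapperClass) {
        TypeToken<?> token = TypeToken.of(mapperClass);
        return new TypeToken<?>[] {
            token.resolveType(Mapper.class.getTypeParameters()[2]),
            token.resolveType(Mapper.class.getTypeParameters()[3])
        };
    }

    // Fail fast if two mappers would emit different key/value types into the same job.
    static void assertConsistent(Class<? extends Mapper> first, Class<? extends Mapper> other) {
        TypeToken<?>[] a = resolveOutputTypes(first);
        TypeToken<?>[] b = resolveOutputTypes(other);
        if (!a[0].equals(b[0]) || !a[1].equals(b[1])) {
            throw new IllegalArgumentException(String.format(
                "Mapper %s emits (%s, %s) but mapper %s emits (%s, %s)",
                first.getName(), a[0], a[1], other.getName(), b[0], b[1]));
        }
    }
}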

Example 7 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.

the class FileConnector method getInputFormatProvider.

@Override
public InputFormatProvider getInputFormatProvider(ConnectorContext context, SampleRequest request) throws IOException {
    Job job = Job.getInstance();
    File file = new File(request.getPath());
    FileInputFormat.addInputPath(job, new Path(file.toURI()));
    return new InputFormatProvider() {

        @Override
        public String getInputFormatClassName() {
            return TextInputFormat.class.getName();
        }

        @Override
        public Map<String, String> getInputFormatConfiguration() {
            return Collections.singletonMap(FileInputFormat.INPUT_DIR, job.getConfiguration().get(FileInputFormat.INPUT_DIR));
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) Job(org.apache.hadoop.mapreduce.Job) File(java.io.File)
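
The anonymous provider above captures the Job only to read back the resolved input directory. A hypothetical reusable variant (not part of the CDAP API; the class name is made up) captures the class name and configuration eagerly instead:

import io.cdap.cdap.api.data.batch.InputFormatProvider;
import java.util.Collections;
import java.util.Map;

final class SimpleInputFormatProvider implements InputFormatProvider {
    private final String className;
    private final Map<String, String> configuration;

    SimpleInputFormatProvider(String className, Map<String, String> configuration) {
        this.className = className;
        this.configuration = Collections.unmodifiableMap(configuration);
    }

    @Override
    public String getInputFormatClassName() {
        return className;
    }

    @Override
    public Map<String, String> getInputFormatConfiguration() {
        return configuration;
    }
}

With this, getInputFormatProvider could simply return new SimpleInputFormatProvider(TextInputFormat.class.getName(), Collections.singletonMap(FileInputFormat.INPUT_DIR, job.getConfiguration().get(FileInputFormat.INPUT_DIR))).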

Example 8 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.

the class ExternalDatasets method makeTrackable.

/**
 * If the input is an external source then an external dataset is created for tracking purpose and returned.
 * If the input is a regular dataset or a stream, it is already trackable, hence the same input is returned.
 *
 * @param admin {@link Admin} used to create the external dataset
 * @param input the input to be tracked
 * @return an external dataset if the input is an external source; otherwise, the same input that was passed in
 */
public static Input makeTrackable(Admin admin, Input input) {
    // If input is not an external source, return the same input as it can be tracked by itself.
    if (!(input instanceof Input.InputFormatProviderInput)) {
        return input;
    }
    // Input is an external source, create an external dataset so that it can be tracked.
    String inputName = input.getName();
    InputFormatProvider inputFormatProvider = ((Input.InputFormatProviderInput) input).getInputFormatProvider();
    Map<String, String> inputFormatConfiguration = inputFormatProvider.getInputFormatConfiguration();
    // if the InputFormatProvider is itself a Dataset, it can already be tracked without creating an external dataset
    if (inputFormatProvider instanceof Dataset) {
        return input;
    }
    try {
        // Create an external dataset for the input format for lineage tracking
        Map<String, String> arguments = new HashMap<>();
        arguments.put("input.format.class", inputFormatProvider.getInputFormatClassName());
        arguments.putAll(inputFormatConfiguration);
        if (!admin.datasetExists(inputName)) {
            // Note: the dataset properties are the same as the runtime arguments because the two cannot be
            // distinguished; they are mixed together in a single configuration object (CDAP-5674).
            // For the same reason, the properties of the created external dataset will contain runtime arguments.
            admin.createDataset(inputName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
        } else {
            // Check if the external dataset name clashes with an existing CDAP Dataset
            String datasetType = admin.getDatasetType(inputName);
            if (!EXTERNAL_DATASET_TYPE.equals(datasetType)) {
                throw new IllegalArgumentException("An external source cannot have the same name as an existing CDAP Dataset instance " + inputName);
            }
        }
        return Input.ofDataset(inputName, Collections.unmodifiableMap(arguments)).alias(input.getAlias());
    } catch (DatasetManagementException e) {
        throw Throwables.propagate(e);
    }
}
Also used : DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) HashMap(java.util.HashMap) Dataset(io.cdap.cdap.api.dataset.Dataset)
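
For context, here is a hedged usage sketch of makeTrackable (assuming the Input.of(name, provider) factory and an Admin instance supplied by the calling program; the input name, directory, and class names below are placeholders):

import io.cdap.cdap.api.Admin;
import io.cdap.cdap.api.data.batch.Input;
import io.cdap.cdap.api.data.batch.InputFormatProvider;
import java.util.Collections;
import java.util.Map;

final class TrackableInputExample {

    // Wrap an external text input so that lineage is recorded against an external dataset.
    // ExternalDatasets is the class shown above; its import is omitted here.
    static Input trackableTextInput(Admin admin, String inputDir) {
        InputFormatProvider provider = new InputFormatProvider() {
            @Override
            public String getInputFormatClassName() {
                return "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
            }

            @Override
            public Map<String, String> getInputFormatConfiguration() {
                return Collections.singletonMap("mapreduce.input.fileinputformat.inputdir", inputDir);
            }
        };
        return ExternalDatasets.makeTrackable(admin, Input.of("externalFiles", provider));
    }
}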

Example 9 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.

the class SparkBatchSourceFactory method createInputRDD.

@SuppressWarnings("unchecked")
private <K, V> JavaPairRDD<K, V> createInputRDD(JavaSparkExecutionContext sec, JavaSparkContext jsc, String inputName, Class<K> keyClass, Class<V> valueClass) {
    if (inputFormatProviders.containsKey(inputName)) {
        InputFormatProvider inputFormatProvider = inputFormatProviders.get(inputName);
        ClassLoader classLoader = Objects.firstNonNull(currentThread().getContextClassLoader(), getClass().getClassLoader());
        return RDDUtils.readUsingInputFormat(jsc, inputFormatProvider, classLoader, keyClass, valueClass);
    }
    if (datasetInfos.containsKey(inputName)) {
        DatasetInfo datasetInfo = datasetInfos.get(inputName);
        return sec.fromDataset(datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
    }
    // this should never happen: the factory methods ensure that exactly one source type is specified for each input
    throw new IllegalStateException("Unknown source type");
}
Also used : BasicInputFormatProvider(io.cdap.cdap.etl.batch.BasicInputFormatProvider) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider)
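
RDDUtils.readUsingInputFormat is not shown here; a rough sketch of the general technique (not the CDAP implementation; the class and method names below are made up) is to copy the provider's string configuration into a Hadoop Configuration and hand it to newAPIHadoopRDD:

import io.cdap.cdap.api.data.batch.InputFormatProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Map;

final class ReadFromProvider {

    @SuppressWarnings("unchecked")
    static <K, V> JavaPairRDD<K, V> read(JavaSparkContext jsc, InputFormatProvider provider,
                                         ClassLoader classLoader,
                                         Class<K> keyClass, Class<V> valueClass) throws Exception {
        // Copy the provider's configuration into a fresh Hadoop Configuration.
        Configuration hConf = new Configuration();
        for (Map.Entry<String, String> entry : provider.getInputFormatConfiguration().entrySet()) {
            hConf.set(entry.getKey(), entry.getValue());
        }
        // Load the input format class through the supplied class loader and create the RDD.
        Class<InputFormat<K, V>> inputFormatClass =
            (Class<InputFormat<K, V>>) classLoader.loadClass(provider.getInputFormatClassName());
        return jsc.newAPIHadoopRDD(hConf, inputFormatClass, keyClass, valueClass);
    }
}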

Example 10 with InputFormatProvider

use of io.cdap.cdap.api.data.batch.InputFormatProvider in project hydrator-plugins by cdapio.

the class SnapshotFileBatchSource method configurePipeline.

@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    String inputFormatName = getInputFormatName();
    InputFormatProvider inputFormatProvider = pipelineConfigurer.usePlugin(ValidatingInputFormat.PLUGIN_TYPE, inputFormatName, FORMAT_PLUGIN_ID, config.getProperties());
    if (inputFormatProvider == null) {
        throw new IllegalArgumentException(String.format("Could not find the '%s' input format plugin. " + "Please ensure the '%s' format plugin is installed.", inputFormatName, inputFormatName));
    }
    // read the input format configuration to give the input format plugin a chance to validate its config
    // and fail pipeline deployment if it is invalid
    inputFormatProvider.getInputFormatConfiguration();
    if (!config.containsMacro("name") && !config.containsMacro("basePath") && !config.containsMacro("fileProperties")) {
        pipelineConfigurer.createDataset(config.getName(), PartitionedFileSet.class, createProperties(inputFormatProvider));
    }
    pipelineConfigurer.getStageConfigurer().setOutputSchema(config.getSchema());
}
Also used : InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider)
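
To illustrate the validation hook mentioned in the comment above, here is a hypothetical provider (the class name and the "delimiter" property are made up) that rejects a bad configuration inside getInputFormatConfiguration(), which is what makes pipeline deployment fail early:

import io.cdap.cdap.api.data.batch.InputFormatProvider;
import java.util.Collections;
import java.util.Map;

final class DelimitedFormatProvider implements InputFormatProvider {
    private final String delimiter;

    DelimitedFormatProvider(String delimiter) {
        this.delimiter = delimiter;
    }

    @Override
    public String getInputFormatClassName() {
        return "org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
    }

    @Override
    public Map<String, String> getInputFormatConfiguration() {
        // Throwing here surfaces the error when configurePipeline() reads the configuration.
        if (delimiter == null || delimiter.isEmpty()) {
            throw new IllegalArgumentException("A delimiter must be specified for the delimited format");
        }
        return Collections.singletonMap("delimiter", delimiter);
    }
}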

Aggregations

InputFormatProvider (io.cdap.cdap.api.data.batch.InputFormatProvider): 19
HashMap (java.util.HashMap): 8
LimitingInputFormatProvider (io.cdap.cdap.etl.batch.preview.LimitingInputFormatProvider): 6
Job (org.apache.hadoop.mapreduce.Job): 5
Input (io.cdap.cdap.api.data.batch.Input): 4
BasicInputFormatProvider (io.cdap.cdap.etl.batch.BasicInputFormatProvider): 4
StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord): 3
Schema (io.cdap.cdap.api.data.schema.Schema): 3
IOException (java.io.IOException): 3
ArrayList (java.util.ArrayList): 3
Map (java.util.Map): 3
Path (org.apache.hadoop.fs.Path): 3
Dataset (io.cdap.cdap.api.dataset.Dataset): 2
DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException): 2
DatasetProperties (io.cdap.cdap.api.dataset.DatasetProperties): 2
TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet): 2
FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation): 2
FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation): 2
LimitingInputFormat (io.cdap.cdap.etl.batch.preview.LimitingInputFormat): 2
MapperInput (io.cdap.cdap.internal.app.runtime.batch.dataset.input.MapperInput): 2