Use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.
The class MapReduceRuntimeService, method setInputsIfNeeded.
/**
 * Sets the configurations used for inputs.
 * Multiple mappers could be defined, so we first check that their output types are consistent.
 *
 * @return the TypeToken for one of the mappers (it doesn't matter which one, since we check that all of their
 *         output key/value types are consistent). Returns null if the mapper class was not configured directly
 *         on the job and the job's mapper class is to be used.
 * @throws IllegalArgumentException if any of the configured mapper output types are inconsistent.
 */
@Nullable
private TypeToken<Mapper> setInputsIfNeeded(Job job) throws IOException, ClassNotFoundException {
  Class<? extends Mapper> jobMapperClass = job.getMapperClass();

  Class<? extends Mapper> firstMapperClass = null;
  Map.Entry<Class, Class> firstMapperOutputTypes = null;

  for (Map.Entry<String, MapperInput> mapperInputEntry : context.getMapperInputs().entrySet()) {
    MapperInput mapperInput = mapperInputEntry.getValue();
    InputFormatProvider provider = mapperInput.getInputFormatProvider();
    Map<String, String> inputFormatConfiguration = provider.getInputFormatConfiguration();

    // Default to what is configured on the job if the user didn't specify a mapper for this input
    Class<? extends Mapper> mapperClass = mapperInput.getMapper() == null ? jobMapperClass : mapperInput.getMapper();

    // Check output key/value type consistency, except for the first input
    if (firstMapperClass == null) {
      firstMapperClass = mapperClass;
      firstMapperOutputTypes = getMapperOutputKeyValueTypes(mapperClass);
    } else {
      assertConsistentTypes(firstMapperClass, firstMapperOutputTypes, mapperClass);
    }

    MultipleInputs.addInput(job, mapperInputEntry.getKey(), provider.getInputFormatClassName(),
                            inputFormatConfiguration, mapperClass);
  }

  // If no input specified its own mapper (firstMapperClass == jobMapperClass), resolve whatever is set on the
  // job configuration; this is null if the user didn't configure the mapper class explicitly.
  if (firstMapperClass == null || firstMapperClass == jobMapperClass) {
    return resolveClass(job.getConfiguration(), MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
  }
  return resolveClass(firstMapperClass, Mapper.class);
}
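The two helpers referenced above, getMapperOutputKeyValueTypes and assertConsistentTypes, are not shown in this snippet. Below is a minimal, hypothetical sketch of how they might look (not the actual CDAP implementation), assuming Guava's TypeToken and Maps plus java.lang.reflect.TypeVariable are available and that the Mapper's KEYOUT/VALUEOUT type parameters are resolved reflectively:
// Hypothetical sketch: resolve a Mapper subclass's declared output key/value classes.
// Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> -- the output types are type parameters 2 and 3.
private static Map.Entry<Class, Class> getMapperOutputKeyValueTypes(Class<? extends Mapper> mapperClass) {
  TypeToken<?> mapperType = TypeToken.of(mapperClass);
  TypeVariable<?>[] typeParams = Mapper.class.getTypeParameters();
  Class<?> outputKeyClass = mapperType.resolveType(typeParams[2]).getRawType();
  Class<?> outputValueClass = mapperType.resolveType(typeParams[3]).getRawType();
  return Maps.immutableEntry((Class) outputKeyClass, (Class) outputValueClass);
}

// Hypothetical sketch: compare another mapper's output types against the first mapper's and fail loudly.
private static void assertConsistentTypes(Class<? extends Mapper> firstMapperClass,
                                          Map.Entry<Class, Class> firstMapperOutputTypes,
                                          Class<? extends Mapper> mapperClass) {
  Map.Entry<Class, Class> mapperOutputTypes = getMapperOutputKeyValueTypes(mapperClass);
  if (!firstMapperOutputTypes.equals(mapperOutputTypes)) {
    throw new IllegalArgumentException(String.format(
      "Mapper %s has output types %s, but mapper %s has output types %s; all mapper output types must match.",
      mapperClass.getName(), mapperOutputTypes, firstMapperClass.getName(), firstMapperOutputTypes));
  }
}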
Use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.
The class FileConnector, method getInputFormatProvider.
@Override
public InputFormatProvider getInputFormatProvider(ConnectorContext context, SampleRequest request) throws IOException {
  Job job = Job.getInstance();
  File file = new File(request.getPath());
  FileInputFormat.addInputPath(job, new Path(file.toURI()));
  return new InputFormatProvider() {
    @Override
    public String getInputFormatClassName() {
      return TextInputFormat.class.getName();
    }

    @Override
    public Map<String, String> getInputFormatConfiguration() {
      return Collections.singletonMap(FileInputFormat.INPUT_DIR,
                                      job.getConfiguration().get(FileInputFormat.INPUT_DIR));
    }
  };
}
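For context, a caller of this connector could apply the returned provider to a Hadoop Job. The sketch below is hypothetical (the newSamplingJob helper and its connector, connectorContext, and sampleRequest parameters are made up for illustration) and only shows the two InputFormatProvider getters in use:
// Hypothetical consumer sketch: copy the provider's configuration into a Job and set its input format class.
private Job newSamplingJob(FileConnector connector, ConnectorContext connectorContext, SampleRequest sampleRequest)
  throws IOException, ClassNotFoundException {
  InputFormatProvider provider = connector.getInputFormatProvider(connectorContext, sampleRequest);
  Job samplingJob = Job.getInstance();
  provider.getInputFormatConfiguration().forEach(samplingJob.getConfiguration()::set);
  samplingJob.setInputFormatClass(
    samplingJob.getConfiguration().getClassByName(provider.getInputFormatClassName())
      .asSubclass(org.apache.hadoop.mapreduce.InputFormat.class));
  return samplingJob;
}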
Use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.
The class ExternalDatasets, method makeTrackable.
/**
 * If the input is an external source, an external dataset is created for tracking purposes and returned.
 * If the input is a regular dataset or a stream, it is already trackable, hence the same input is returned.
 *
 * @param admin {@link Admin} used to create the external dataset
 * @param input input to be tracked
 * @return an external dataset if the input is an external source, otherwise the same input that was passed in
 */
public static Input makeTrackable(Admin admin, Input input) {
  // If the input is not an external source, return the same input as it can be tracked by itself.
  if (!(input instanceof Input.InputFormatProviderInput)) {
    return input;
  }

  // The input is an external source; create an external dataset so that it can be tracked.
  String inputName = input.getName();
  InputFormatProvider inputFormatProvider = ((Input.InputFormatProviderInput) input).getInputFormatProvider();
  Map<String, String> inputFormatConfiguration = inputFormatProvider.getInputFormatConfiguration();

  // If the InputFormatProvider is itself a Dataset, it too can be tracked without creating an external dataset.
  if (inputFormatProvider instanceof Dataset) {
    return input;
  }

  try {
    // Create an external dataset for the input format for lineage tracking
    Map<String, String> arguments = new HashMap<>();
    arguments.put("input.format.class", inputFormatProvider.getInputFormatClassName());
    arguments.putAll(inputFormatConfiguration);
    if (!admin.datasetExists(inputName)) {
      // Note: the dataset properties are the same as the arguments because we cannot tell them apart; they are
      // mixed together in a single configuration object (CDAP-5674). For the same reason, the properties of the
      // external dataset created here will contain runtime arguments.
      admin.createDataset(inputName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
    } else {
      // Check whether the external dataset name clashes with an existing CDAP dataset
      String datasetType = admin.getDatasetType(inputName);
      if (!EXTERNAL_DATASET_TYPE.equals(datasetType)) {
        throw new IllegalArgumentException(
          "An external source cannot have the same name as an existing CDAP Dataset instance: " + inputName);
      }
    }
    return Input.ofDataset(inputName, Collections.unmodifiableMap(arguments)).alias(input.getAlias());
  } catch (DatasetManagementException e) {
    throw Throwables.propagate(e);
  }
}
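A hedged usage sketch of the method above: wrapping an ad-hoc external source before registering it as an input. The helper name, the HDFS path, and the use of MapReduceContext.addInput are assumptions for illustration; how the Admin and the batch context are obtained is outside this snippet:
// Hypothetical usage sketch: make an external text-file source trackable for lineage before adding it as input.
// The Admin and MapReduceContext are assumed to be supplied by the enclosing program (e.g. in initialize()).
private void addTrackableExternalInput(Admin admin, MapReduceContext context) {
  InputFormatProvider externalSource = new InputFormatProvider() {
    @Override
    public String getInputFormatClassName() {
      return TextInputFormat.class.getName();
    }

    @Override
    public Map<String, String> getInputFormatConfiguration() {
      // Illustrative path only
      return Collections.singletonMap(FileInputFormat.INPUT_DIR, "hdfs:///data/events");
    }
  };
  Input trackableInput = ExternalDatasets.makeTrackable(admin, Input.of("events", externalSource));
  context.addInput(trackableInput);
}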
Use of io.cdap.cdap.api.data.batch.InputFormatProvider in project cdap by cdapio.
The class SparkBatchSourceFactory, method createInputRDD.
@SuppressWarnings("unchecked")
private <K, V> JavaPairRDD<K, V> createInputRDD(JavaSparkExecutionContext sec, JavaSparkContext jsc, String inputName,
                                                Class<K> keyClass, Class<V> valueClass) {
  if (inputFormatProviders.containsKey(inputName)) {
    InputFormatProvider inputFormatProvider = inputFormatProviders.get(inputName);
    ClassLoader classLoader = Objects.firstNonNull(currentThread().getContextClassLoader(),
                                                   getClass().getClassLoader());
    return RDDUtils.readUsingInputFormat(jsc, inputFormatProvider, classLoader, keyClass, valueClass);
  }
  if (datasetInfos.containsKey(inputName)) {
    DatasetInfo datasetInfo = datasetInfos.get(inputName);
    return sec.fromDataset(datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
  }
  // This should never happen: the factory is constructed so that exactly one of these source types is specified.
  throw new IllegalStateException("Unknown source type");
}
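RDDUtils.readUsingInputFormat is not shown here. Conceptually it bridges the InputFormatProvider to Spark; a rough, hypothetical sketch (not the actual RDDUtils code, and the method name is made up) that copies the provider's configuration into a Hadoop Configuration and hands the input format class to newAPIHadoopRDD:
// Hypothetical sketch of the bridge from an InputFormatProvider to a Spark pair RDD.
// Uses org.apache.hadoop.conf.Configuration and org.apache.hadoop.mapreduce.InputFormat.
private static <K, V> JavaPairRDD<K, V> readUsingInputFormatSketch(JavaSparkContext jsc,
                                                                   InputFormatProvider provider,
                                                                   ClassLoader classLoader,
                                                                   Class<K> keyClass,
                                                                   Class<V> valueClass) throws ClassNotFoundException {
  Configuration hConf = new Configuration();
  for (Map.Entry<String, String> entry : provider.getInputFormatConfiguration().entrySet()) {
    hConf.set(entry.getKey(), entry.getValue());
  }
  @SuppressWarnings("unchecked")
  Class<? extends InputFormat<K, V>> inputFormatClass =
    (Class<? extends InputFormat<K, V>>) classLoader.loadClass(provider.getInputFormatClassName());
  return jsc.newAPIHadoopRDD(hConf, inputFormatClass, keyClass, valueClass);
}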
Use of io.cdap.cdap.api.data.batch.InputFormatProvider in project hydrator-plugins by cdapio.
The class SnapshotFileBatchSource, method configurePipeline.
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  String inputFormatName = getInputFormatName();
  InputFormatProvider inputFormatProvider = pipelineConfigurer.usePlugin(ValidatingInputFormat.PLUGIN_TYPE,
                                                                         inputFormatName, FORMAT_PLUGIN_ID,
                                                                         config.getProperties());
  if (inputFormatProvider == null) {
    throw new IllegalArgumentException(String.format("Could not find the '%s' input format plugin. "
                                                       + "Please ensure the '%s' format plugin is installed.",
                                                     inputFormatName, inputFormatName));
  }

  // Get the input format configuration to give the input format plugin a chance to validate its config
  // and fail pipeline deployment if it is invalid.
  inputFormatProvider.getInputFormatConfiguration();

  if (!config.containsMacro("name") && !config.containsMacro("basePath") && !config.containsMacro("fileProperties")) {
    pipelineConfigurer.createDataset(config.getName(), PartitionedFileSet.class, createProperties(inputFormatProvider));
  }
  pipelineConfigurer.getStageConfigurer().setOutputSchema(config.getSchema());
}
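The bare getInputFormatConfiguration() call above relies on the format plugin throwing from that method when its configuration is invalid. A hypothetical variant (not part of the plugin's actual code) that wraps the call to surface a clearer deployment-time message:
// Hypothetical variant: re-wrap validation failures from the format plugin with a clearer message.
try {
  inputFormatProvider.getInputFormatConfiguration();
} catch (RuntimeException e) {
  throw new IllegalArgumentException(
    String.format("Format '%s' has an invalid configuration: %s", inputFormatName, e.getMessage()), e);
}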