use of co.cask.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.
the class ExternalDatasets method makeTrackable.
/**
 * Makes an {@link Input} trackable, registering an external dataset for it when needed.
 *
 * <p>The input is returned unchanged when it is not an {@code Input.InputFormatProviderInput}, or when its
 * {@link InputFormatProvider} is itself a {@link Dataset}. Otherwise an external dataset named after the input
 * is created (unless one of the external type already exists under that name) and a dataset-based input that
 * carries the original alias is returned in its place.
 *
 * @param admin {@link Admin} used to look up and create the external dataset
 * @param input input to be tracked
 * @return the same {@code input} if it needs no external dataset, otherwise a dataset input named after it
 * @throws IllegalArgumentException if a dataset with the input's name exists but is not of the external type
 */
public static Input makeTrackable(Admin admin, Input input) {
  // Inputs that are not backed by an input format provider are handed back untouched.
  if (!(input instanceof Input.InputFormatProviderInput)) {
    return input;
  }

  String name = input.getName();
  Input.InputFormatProviderInput providerInput = (Input.InputFormatProviderInput) input;
  InputFormatProvider provider = providerInput.getInputFormatProvider();
  Map<String, String> providerConf = provider.getInputFormatConfiguration();

  // A provider that is a Dataset is handed back untouched as well; no external dataset is created for it.
  if (provider instanceof Dataset) {
    return input;
  }

  try {
    // The input format class name goes in first; provider configuration is layered on top of it.
    Map<String, String> arguments = new HashMap<>();
    arguments.put("input.format.class", provider.getInputFormatClassName());
    arguments.putAll(providerConf);

    if (admin.datasetExists(name)) {
      // The name is taken: that is only acceptable when the existing dataset is an external one.
      String existingType = admin.getDatasetType(name);
      if (!EXTERNAL_DATASET_TYPE.equals(existingType)) {
        throw new IllegalArgumentException("An external source cannot have the same name as an existing CDAP Dataset instance " + name);
      }
    } else {
      // Dataset properties and runtime arguments come from the same configuration map and cannot be
      // told apart (CDAP-5674), so the full argument map is used as the dataset properties.
      admin.createDataset(name, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
    }

    Map<String, String> readOnlyArguments = Collections.unmodifiableMap(arguments);
    return Input.ofDataset(name, readOnlyArguments).alias(input.getAlias());
  } catch (DatasetManagementException e) {
    throw Throwables.propagate(e);
  }
}
use of co.cask.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.
the class SparkBatchSourceFactory method createInputRDD.
/**
 * Creates the pair RDD for the source registered under {@code inputName}.
 *
 * <p>The name is looked up, in order, among the registered streams, input format providers and datasets;
 * the first registry that contains it decides how the RDD is built.
 *
 * @param sec execution context used for stream and dataset sources
 * @param jsc Spark context used for input-format sources
 * @param inputName name of the source to read
 * @param keyClass key class of the resulting RDD
 * @param valueClass value class of the resulting RDD
 * @return the RDD reading from the named source
 * @throws IllegalStateException if the name is not present in any of the three registries
 */
@SuppressWarnings("unchecked")
private <K, V> JavaPairRDD<K, V> createInputRDD(JavaSparkExecutionContext sec, JavaSparkContext jsc, String inputName, Class<K> keyClass, Class<V> valueClass) {
  // --- Stream source ---
  if (streams.containsKey(inputName)) {
    Input.StreamInput stream = streams.get(inputName);

    // A body format specification takes precedence over any decoder.
    FormatSpecification bodyFormat = stream.getBodyFormatSpec();
    if (bodyFormat != null) {
      return (JavaPairRDD<K, V>) sec.fromStream(stream.getName(), bodyFormat, stream.getStartTime(), stream.getEndTime(), StructuredRecord.class);
    }

    String decoderName = stream.getDecoderType();
    if (decoderName != null) {
      try {
        // The decoder class is loaded through the thread context class loader.
        ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
        Class<StreamEventDecoder<K, V>> decoderClass = (Class<StreamEventDecoder<K, V>>) contextLoader.loadClass(decoderName);
        return sec.fromStream(stream.getName(), stream.getStartTime(), stream.getEndTime(), decoderClass, keyClass, valueClass);
      } catch (Exception e) {
        throw Throwables.propagate(e);
      }
    }

    // Neither a format nor a decoder: read the stream with only the value class.
    return (JavaPairRDD<K, V>) sec.fromStream(stream.getName(), stream.getStartTime(), stream.getEndTime(), valueClass);
  }

  // --- Input format source ---
  if (inputFormatProviders.containsKey(inputName)) {
    InputFormatProvider provider = inputFormatProviders.get(inputName);

    // Start from a cleared configuration so that only the provider's own settings are present.
    Configuration providerConf = new Configuration();
    providerConf.clear();
    for (Map.Entry<String, String> setting : provider.getInputFormatConfiguration().entrySet()) {
      providerConf.set(setting.getKey(), setting.getValue());
    }

    // Use the context class loader when set, otherwise the loader of this class.
    ClassLoader loader = Objects.firstNonNull(currentThread().getContextClassLoader(), getClass().getClassLoader());
    try {
      Class<InputFormat> formatClass = (Class<InputFormat>) loader.loadClass(provider.getInputFormatClassName());
      return jsc.newAPIHadoopRDD(providerConf, formatClass, keyClass, valueClass);
    } catch (ClassNotFoundException e) {
      throw Throwables.propagate(e);
    }
  }

  // --- Dataset source ---
  if (datasetInfos.containsKey(inputName)) {
    DatasetInfo info = datasetInfos.get(inputName);
    return sec.fromDataset(info.getDatasetName(), info.getDatasetArgs());
  }

  // The input name matched none of the stream, input format provider or dataset registries.
  throw new IllegalStateException("Unknown source type");
}
use of co.cask.cdap.api.data.batch.InputFormatProvider in project cdap by caskdata.
the class MapReduceRuntimeService method setInputsIfNeeded.
/**
 * Registers every mapper input from the context on the job.
 *
 * <p>Each input may name its own mapper; inputs that do not fall back to the mapper class configured on the
 * job. The output key/value types of the first input's mapper serve as the reference that every subsequent
 * mapper is checked against. Stream inputs additionally get their decoder set up and are subject to a READ
 * authorization check for the current principal.
 *
 * @param job the job to add the inputs to
 * @return the type resolved from the first input's mapper class when that mapper differs from the job's
 *         mapper class; otherwise whatever is resolved from the job configuration's mapper class attribute,
 *         which may be {@code null}
 * @throws IOException if the authorization check fails (non-IO failures are wrapped)
 * @throws ClassNotFoundException declared by this method; not thrown directly by the code visible here
 * @throws IllegalArgumentException if any of the configured mapper output types are inconsistent
 */
@Nullable
private TypeToken<Mapper> setInputsIfNeeded(Job job) throws IOException, ClassNotFoundException {
  Class<? extends Mapper> defaultMapper = job.getMapperClass();
  Class<? extends Mapper> referenceMapper = null;
  Map.Entry<Class, Class> referenceOutputTypes = null;

  for (Map.Entry<String, MapperInput> inputEntry : context.getMapperInputs().entrySet()) {
    MapperInput mapperInput = inputEntry.getValue();
    InputFormatProvider provider = mapperInput.getInputFormatProvider();
    Map<String, String> inputConf = mapperInput.getInputFormatConfiguration();

    // Fall back to the job-level mapper when this input does not name one.
    Class<? extends Mapper> effectiveMapper;
    if (mapperInput.getMapper() == null) {
      effectiveMapper = defaultMapper;
    } else {
      effectiveMapper = mapperInput.getMapper();
    }

    // The first input establishes the reference types; every later one must agree with it.
    if (referenceMapper != null) {
      assertConsistentTypes(referenceMapper, referenceOutputTypes, effectiveMapper);
    } else {
      referenceMapper = effectiveMapper;
      referenceOutputTypes = getMapperOutputKeyValueTypes(effectiveMapper);
    }

    // Streams need special handling.
    if (provider instanceof StreamInputFormatProvider) {
      StreamInputFormatProvider streamProvider = (StreamInputFormatProvider) provider;
      // Pass the mapper exactly as declared on the input (possibly null), not the defaulted one.
      setDecoderForStream(streamProvider, job, inputConf, mapperInput.getMapper());
      // Enforce READ access on the stream for the current principal.
      try {
        authorizationEnforcer.enforce(streamProvider.getStreamId(), authenticationContext.getPrincipal(), Action.READ);
      } catch (Exception e) {
        // Rethrow unchanged when possible; anything else is wrapped in an IOException.
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
      }
    }

    MultipleInputs.addInput(job, inputEntry.getKey(), mapperInput.getInputFormatClassName(), inputConf, effectiveMapper);
  }

  // A reference mapper that differs from the job-level mapper is resolved directly.
  if (referenceMapper != null && referenceMapper != defaultMapper) {
    return resolveClass(referenceMapper, Mapper.class);
  }
  // No inputs, or the first input uses the job-level mapper: resolve from the job configuration instead.
  return resolveClass(job.getConfiguration(), MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
}
Aggregations