Use of io.cdap.cdap.etl.batch.preview.LimitingInputFormat in project cdap by caskdata.
The sample method of the class LimitingConnector.
/**
 * Reads a sample of records through the batch connector's input format.
 *
 * <p>The input format provider obtained from {@code batchConnector} is wrapped with the
 * limiting input format configuration (using {@code request.getLimit()}), a Hadoop
 * {@link Job} is created from that configuration, and every key/value pair read from the
 * first split is converted with {@code batchConnector.transform}.
 *
 * @param context the connector context handed to the batch connector
 * @param request the sample request; supplies the limit passed to the limiting format
 * @return the transformed records, in the order the record reader produced them
 * @throws IOException if computing the splits or reading the records fails, or if the
 *     calling thread is interrupted while doing so (the interrupt flag is restored and the
 *     {@link InterruptedException} is attached as the cause)
 */
@Override
public List<StructuredRecord> sample(ConnectorContext context, SampleRequest request) throws IOException {
  InputFormatProvider inputFormatProvider = batchConnector.getInputFormatProvider(context, request);

  // use limiting format to read from the input format
  Map<String, String> configs = LimitingInputFormatProvider.getConfiguration(inputFormatProvider, request.getLimit());
  Configuration hConf = new Configuration();
  // the class loader comes from the plugin configurer rather than the Configuration default
  hConf.setClassLoader(pluginConfigurer.createClassLoader());
  configs.forEach(hConf::set);

  Job job = Job.getInstance(hConf);
  // fixed job id; it is reused below to build the task id and task attempt id
  job.setJobID(new JobID("sample", 0));

  LimitingInputFormat<?, ?> inputFormat = new LimitingInputFormat<>();
  List<InputSplit> splits;
  try {
    splits = inputFormat.getSplits(job);
  } catch (InterruptedException e) {
    // restore the interrupt flag so callers further up can observe the interruption,
    // and keep the original exception as the cause instead of discarding it
    Thread.currentThread().interrupt();
    throw new IOException(String.format("Unable to get the splits from the input format %s", inputFormatProvider.getInputFormatClassName()), e);
  }

  List<StructuredRecord> sample = new ArrayList<>();
  // limiting format only has 1 split
  InputSplit split = splits.get(0);
  TaskID taskId = new TaskID(job.getJobID(), TaskType.MAP, 0);
  TaskAttemptContext taskContext = new TaskAttemptContextImpl(hConf, new TaskAttemptID(taskId, 0));

  // create record reader to read the results
  try (RecordReader<?, ?> reader = inputFormat.createRecordReader(split, taskContext)) {
    reader.initialize(split, taskContext);
    while (reader.nextKeyValue()) {
      sample.add(batchConnector.transform(reader.getCurrentKey(), reader.getCurrentValue()));
    }
  } catch (InterruptedException e) {
    // same handling as above: re-assert the interrupt and preserve the cause
    Thread.currentThread().interrupt();
    throw new IOException(String.format("Unable to read the values from the input format %s", inputFormatProvider.getInputFormatClassName()), e);
  }
  return sample;
}
Aggregations