Use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.
The class TestMultipleInputs, method testAddInputPathWithFormat.
@Test
public void testAddInputPathWithFormat() throws IOException {
  final Job conf = Job.getInstance();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class);
  final Map<Path, InputFormat> inputs = MultipleInputs.getInputFormatMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")).getClass());
}
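The test only checks the bookkeeping that addInputPath performs. In a real driver, MultipleInputs is usually given a mapper per path as well; the following is a minimal sketch, assuming hypothetical FooMapper, BarMapper and JoinReducer classes and illustrative paths:

// Driver sketch (not part of the Hadoop test); FooMapper, BarMapper,
// JoinReducer and all paths are hypothetical placeholders.
Job job = Job.getInstance(new Configuration(), "multiple-inputs-example");
MultipleInputs.addInputPath(job, new Path("/foo"), TextInputFormat.class, FooMapper.class);
MultipleInputs.addInputPath(job, new Path("/bar"), KeyValueTextInputFormat.class, BarMapper.class);
job.setReducerClass(JoinReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileOutputFormat.setOutputPath(job, new Path("/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);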
Use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.
The class InputSampler, method writePartitionFile.
/**
* Write a partition file for the given job, using the Sampler provided.
* Queries the sampler for a sample keyset, sorts by the output key
* comparator, selects the keys for each rank, and writes to the destination
* returned from {@link TotalOrderPartitioner#getPartitionFile}.
*/
// getInputFormat, getOutputKeyComparator
@SuppressWarnings("unchecked")
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = job.getConfiguration();
  final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = (K[]) sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  fs.delete(dst, false);
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for (int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
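writePartitionFile is normally called from a driver that pairs it with TotalOrderPartitioner; a hedged sketch of that pattern (paths, key type, reducer count and sampler parameters are illustrative, not taken from the Hadoop source):

Job job = Job.getInstance(new Configuration(), "total-order-sort");
job.setNumReduceTasks(4);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setPartitionerClass(TotalOrderPartitioner.class);
FileInputFormat.setInputPaths(job, new Path("/input"));
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/tmp/_partitions"));
// sample roughly 10% of the keys, at most 10000 samples from at most 10 splits
InputSampler.Sampler<Text, Text> sampler = new InputSampler.RandomSampler<>(0.1, 10000, 10);
InputSampler.writePartitionFile(job, sampler);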
Use of org.apache.hadoop.mapreduce.InputFormat in project asterixdb by apache.
The class DataflowTest, method testHDFSReadWriteOperators.
/**
* Test a job with only HDFS reads and writes.
*
* @throws Exception
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testHDFSReadWriteOperators() throws Exception {
  FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
  FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
  conf.setInputFormatClass(TextInputFormat.class);
  Scheduler scheduler = new Scheduler(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
  InputFormat inputFormat = ReflectionUtils.newInstance(conf.getInputFormatClass(), getConfiguration());
  List<InputSplit> splits = inputFormat.getSplits(conf);
  String[] readSchedule = scheduler.getLocationConstraints(splits);
  JobSpecification jobSpec = new JobSpecification();
  RecordDescriptor recordDesc =
      new RecordDescriptor(new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
  String[] locations =
      new String[] { HyracksUtils.NC1_ID, HyracksUtils.NC1_ID, HyracksUtils.NC2_ID, HyracksUtils.NC2_ID };
  HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(jobSpec, recordDesc, conf, splits,
      readSchedule, new TextKeyValueParserFactory());
  PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, readOperator, locations);
  ExternalSortOperatorDescriptor sortOperator = new ExternalSortOperatorDescriptor(jobSpec, 10, new int[] { 0 },
      new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, recordDesc);
  PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, sortOperator, locations);
  HDFSWriteOperatorDescriptor writeOperator =
      new HDFSWriteOperatorDescriptor(jobSpec, conf, new TextTupleWriterFactory());
  PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, writeOperator, HyracksUtils.NC1_ID);
  jobSpec.connect(new OneToOneConnectorDescriptor(jobSpec), readOperator, 0, sortOperator, 0);
  jobSpec.connect(new MToNPartitioningMergingConnectorDescriptor(jobSpec,
      new FieldHashPartitionComputerFactory(new int[] { 0 },
          new IBinaryHashFunctionFactory[] { RawBinaryHashFunctionFactory.INSTANCE }),
      new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, null),
      sortOperator, 0, writeOperator, 0);
  jobSpec.addRoot(writeOperator);
  IHyracksClientConnection client =
      new HyracksConnection(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
  JobId jobId = client.startJob(jobSpec);
  client.waitForCompletion(jobId);
  Assert.assertEquals(true, checkResults());
}
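The InputFormat-specific part of the test is the split computation that the Hyracks Scheduler turns into read constraints. In isolation the pattern looks roughly like this (format and path are placeholders, not the test's configuration):

Job job = Job.getInstance(new Configuration());
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job, new Path("/data/input"));
InputFormat<?, ?> format = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
// each split reports the hosts holding its data; a scheduler can use this
// to place readers close to the HDFS blocks
for (InputSplit split : format.getSplits(job)) {
  System.out.println(split + " -> " + Arrays.toString(split.getLocations()));
}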
Use of org.apache.hadoop.mapreduce.InputFormat in project cdap by caskdata.
The class MultiInputFormat, method getSplits.
@Override
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> splits = new ArrayList<>();
  Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());
  for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
    String inputName = mapperInputEntry.getKey();
    MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
    String mapperClassName = mapperInput.getMapperClassName();
    Job jobCopy = new Job(job.getConfiguration());
    Configuration confCopy = jobCopy.getConfiguration();
    // set configuration specific to this input onto the jobCopy
    ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);
    Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
    Preconditions.checkNotNull(inputFormatClass, "Class could not be found: %s",
        mapperInput.getInputFormatClassName());
    InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);
    // some input formats need a jobId to compute splits
    jobCopy.setJobID(new JobID(inputName, inputName.hashCode()));
    // Get splits for each input path and tag with InputFormat
    // and Mapper types by wrapping in a MultiInputTaggedSplit.
    List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
    for (InputSplit split : formatSplits) {
      splits.add(new MultiInputTaggedSplit(split, confCopy, inputName,
          mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
    }
  }
  return splits;
}
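The wrapping done in the inner loop is conceptually simple: each delegate split is tagged with the named input it came from so that, at task time, the matching InputFormat and Mapper can be re-created. A bare-bones sketch of such a wrapper, leaving out the Writable serialization that CDAP's real MultiInputTaggedSplit has to implement:

// Hypothetical wrapper for illustration only; not CDAP's implementation.
class TaggedSplit extends InputSplit {
  private final InputSplit delegate;
  private final String inputName;
  private final String mapperClassName;

  TaggedSplit(InputSplit delegate, String inputName, String mapperClassName) {
    this.delegate = delegate;
    this.inputName = inputName;
    this.mapperClassName = mapperClassName;
  }

  // length and locations are simply forwarded to the wrapped split
  @Override
  public long getLength() throws IOException, InterruptedException {
    return delegate.getLength();
  }

  @Override
  public String[] getLocations() throws IOException, InterruptedException {
    return delegate.getLocations();
  }

  String getInputName() { return inputName; }

  String getMapperClassName() { return mapperClassName; }
}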
Use of org.apache.hadoop.mapreduce.InputFormat in project druid by druid-io.
The class BaseParquetInputTest, method getAllRows.
static List<InputRow> getAllRows(String parserType, HadoopDruidIndexerConfig config)
    throws IOException, InterruptedException {
  Job job = Job.getInstance(new Configuration());
  config.intoConfiguration(job);
  File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
  Path path = new Path(testFile.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile.length(), null);
  InputFormat inputFormat = ReflectionUtils.newInstance(INPUT_FORMAT_CLASSES.get(parserType), job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
    List<InputRow> records = new ArrayList<>();
    InputRowParser parser = config.getParser();
    reader.initialize(split, context);
    // advance the reader exactly once per iteration; a second nextKeyValue()
    // call inside the loop would silently skip every other record
    while (reader.nextKeyValue()) {
      Object data = reader.getCurrentValue();
      records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
    }
    return records;
  }
}
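The loop follows the standard RecordReader lifecycle: createRecordReader, initialize, then nextKeyValue/getCurrentValue until the split is exhausted. A self-contained sketch of the same lifecycle against a plain TextInputFormat (the file path and split length are placeholders):

Job job = Job.getInstance(new Configuration());
TextInputFormat format = new TextInputFormat();
FileSplit split = new FileSplit(new Path("/tmp/sample.txt"), 0, 1024, null);
TaskAttemptContext ctx = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
try (RecordReader<LongWritable, Text> reader = format.createRecordReader(split, ctx)) {
  reader.initialize(split, ctx);
  while (reader.nextKeyValue()) {              // advance exactly once per record
    LongWritable offset = reader.getCurrentKey();
    Text line = reader.getCurrentValue();
    System.out.println(offset + "\t" + line);
  }
}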