
Example 1 with InputFormat

Use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.

From the class TestMultipleInputs, method testAddInputPathWithFormat.

@Test
public void testAddInputPathWithFormat() throws IOException {
    final Job conf = Job.getInstance();
    MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
    MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class);
    final Map<Path, InputFormat> inputs = MultipleInputs.getInputFormatMap(conf);
    assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
    assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")).getClass());
}
Also used: Path (org.apache.hadoop.fs.Path), InputFormat (org.apache.hadoop.mapreduce.InputFormat), KeyValueTextInputFormat (org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)
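
For context, here is a minimal driver-style sketch of the same API, assuming hypothetical LogMapper and CsvMapper classes and illustrative paths; the four-argument overload also binds a Mapper to each path:

Job job = Job.getInstance(new Configuration(), "multi-input-example");
// /logs holds plain text lines; LogMapper (hypothetical) receives offset/line pairs.
MultipleInputs.addInputPath(job, new Path("/logs"), TextInputFormat.class, LogMapper.class);
// /csv holds tab-separated records; CsvMapper (hypothetical) receives key/value Text pairs.
MultipleInputs.addInputPath(job, new Path("/csv"), KeyValueTextInputFormat.class, CsvMapper.class);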

Example 2 with InputFormat

Use of org.apache.hadoop.mapreduce.InputFormat in project hadoop by apache.

From the class InputSampler, method writePartitionFile.

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
// getInputFormat, getOutputKeyComparator
@SuppressWarnings("unchecked")
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    fs.delete(dst, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
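    // Pick numPartitions - 1 evenly spaced ranks from the sorted sample as split
    // points, advancing past duplicates so no two partition keys compare equal.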
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), NullWritable (org.apache.hadoop.io.NullWritable), RawComparator (org.apache.hadoop.io.RawComparator), SequenceFile (org.apache.hadoop.io.SequenceFile), FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat), InputFormat (org.apache.hadoop.mapreduce.InputFormat), FileSystem (org.apache.hadoop.fs.FileSystem)
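
A typical call site pairs this method with TotalOrderPartitioner in the job driver. A minimal sketch, assuming a configured Text-keyed Job; the sampling frequency, sample cap, split limit, and partition file path are illustrative:

// Sample roughly 10% of keys, capped at 10000 samples drawn from at most 10 splits.
InputSampler.Sampler<Text, Text> sampler = new InputSampler.RandomSampler<>(0.1, 10000, 10);
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/tmp/partitions"));
InputSampler.writePartitionFile(job, sampler);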

Example 3 with InputFormat

Use of org.apache.hadoop.mapreduce.InputFormat in project asterixdb by apache.

From the class DataflowTest, method testHDFSReadWriteOperators.

/**
 * Test a job with only HDFS reads and writes.
 *
 * @throws Exception
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testHDFSReadWriteOperators() throws Exception {
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    FileOutputFormat.setOutputPath(conf, new Path(HDFS_OUTPUT_PATH));
    conf.setInputFormatClass(TextInputFormat.class);
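    // Compute a data-local read schedule: each HDFS split is assigned to a Hyracks node controller.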
    Scheduler scheduler = new Scheduler(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
    InputFormat inputFormat = ReflectionUtils.newInstance(conf.getInputFormatClass(), getConfiguration());
    List<InputSplit> splits = inputFormat.getSplits(conf);
    String[] readSchedule = scheduler.getLocationConstraints(splits);
    JobSpecification jobSpec = new JobSpecification();
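    // Each tuple flowing through the pipeline carries a single UTF-8 string field.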
    RecordDescriptor recordDesc = new RecordDescriptor(new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    String[] locations = new String[] { HyracksUtils.NC1_ID, HyracksUtils.NC1_ID, HyracksUtils.NC2_ID, HyracksUtils.NC2_ID };
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(jobSpec, recordDesc, conf, splits, readSchedule, new TextKeyValueParserFactory());
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, readOperator, locations);
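    // Externally sort on field 0 using raw binary comparison.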
    ExternalSortOperatorDescriptor sortOperator = new ExternalSortOperatorDescriptor(jobSpec, 10, new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, recordDesc);
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, sortOperator, locations);
    HDFSWriteOperatorDescriptor writeOperator = new HDFSWriteOperatorDescriptor(jobSpec, conf, new TextTupleWriterFactory());
    PartitionConstraintHelper.addAbsoluteLocationConstraint(jobSpec, writeOperator, HyracksUtils.NC1_ID);
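    // read -> sort is one-to-one per partition; sort -> write merges all sorted partitions into one ordered stream.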
    jobSpec.connect(new OneToOneConnectorDescriptor(jobSpec), readOperator, 0, sortOperator, 0);
    jobSpec.connect(new MToNPartitioningMergingConnectorDescriptor(jobSpec, new FieldHashPartitionComputerFactory(new int[] { 0 }, new IBinaryHashFunctionFactory[] { RawBinaryHashFunctionFactory.INSTANCE }), new int[] { 0 }, new IBinaryComparatorFactory[] { RawBinaryComparatorFactory.INSTANCE }, null), sortOperator, 0, writeOperator, 0);
    jobSpec.addRoot(writeOperator);
    IHyracksClientConnection client = new HyracksConnection(HyracksUtils.CC_HOST, HyracksUtils.TEST_HYRACKS_CC_CLIENT_PORT);
    JobId jobId = client.startJob(jobSpec);
    client.waitForCompletion(jobId);
    Assert.assertTrue(checkResults());
}
Also used: Path (org.apache.hadoop.fs.Path), IHyracksClientConnection (org.apache.hyracks.api.client.IHyracksClientConnection), Scheduler (org.apache.hyracks.hdfs2.scheduler.Scheduler), RecordDescriptor (org.apache.hyracks.api.dataflow.value.RecordDescriptor), MToNPartitioningMergingConnectorDescriptor (org.apache.hyracks.dataflow.std.connectors.MToNPartitioningMergingConnectorDescriptor), IBinaryComparatorFactory (org.apache.hyracks.api.dataflow.value.IBinaryComparatorFactory), OneToOneConnectorDescriptor (org.apache.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor), UTF8StringSerializerDeserializer (org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer), FieldHashPartitionComputerFactory (org.apache.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory), TextTupleWriterFactory (org.apache.hyracks.hdfs.lib.TextTupleWriterFactory), TextInputFormat (org.apache.hadoop.mapreduce.lib.input.TextInputFormat), FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat), InputFormat (org.apache.hadoop.mapreduce.InputFormat), ExternalSortOperatorDescriptor (org.apache.hyracks.dataflow.std.sort.ExternalSortOperatorDescriptor), JobSpecification (org.apache.hyracks.api.job.JobSpecification), HyracksConnection (org.apache.hyracks.api.client.HyracksConnection), InputSplit (org.apache.hadoop.mapreduce.InputSplit), TextKeyValueParserFactory (org.apache.hyracks.hdfs.lib.TextKeyValueParserFactory), JobId (org.apache.hyracks.api.job.JobId)

Example 4 with InputFormat

Use of org.apache.hadoop.mapreduce.InputFormat in project cdap by caskdata.

From the class MultiInputFormat, method getSplits.

@Override
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = new ArrayList<>();
    Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());
    for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
        String inputName = mapperInputEntry.getKey();
        MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
        String mapperClassName = mapperInput.getMapperClassName();
        Job jobCopy = Job.getInstance(job.getConfiguration());
        Configuration confCopy = jobCopy.getConfiguration();
        // set configuration specific for this input onto the jobCopy
        ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);
        Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
        Preconditions.checkNotNull(inputFormatClass, "Class could not be found: %s", mapperInput.getInputFormatClassName());
        InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);
        // some input format need a jobId to getSplits
        jobCopy.setJobID(new JobID(inputName, inputName.hashCode()));
        // Get splits for each input path and tag with InputFormat
        // and Mapper types by wrapping in a MultiInputTaggedSplit.
        List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit split : formatSplits) {
            splits.add(new MultiInputTaggedSplit(split, confCopy, inputName, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
        }
    }
    return splits;
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), InputFormat (org.apache.hadoop.mapreduce.InputFormat), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Map (java.util.Map), JobID (org.apache.hadoop.mapred.JobID)
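
getSplits is only half of the contract: at task time the format must route each tagged split back to the InputFormat that produced it. A sketch of the usual createRecordReader counterpart; the accessor names on MultiInputTaggedSplit are assumed here for illustration:

@Override
@SuppressWarnings("unchecked")
public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    MultiInputTaggedSplit taggedSplit = (MultiInputTaggedSplit) split;
    // Re-create the InputFormat recorded on the split (accessor names assumed) and delegate to it.
    InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils.newInstance(taggedSplit.getInputFormatClass(), context.getConfiguration());
    return inputFormat.createRecordReader(taggedSplit.getInputSplit(), context);
}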

Example 5 with InputFormat

Use of org.apache.hadoop.mapreduce.InputFormat in project druid by druid-io.

From the class BaseParquetInputTest, method getAllRows.

static List<InputRow> getAllRows(String parserType, HadoopDruidIndexerConfig config) throws IOException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);
    InputFormat inputFormat = ReflectionUtils.newInstance(INPUT_FORMAT_CLASSES.get(parserType), job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        List<InputRow> records = new ArrayList<>();
        InputRowParser parser = config.getParser();
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            // nextKeyValue() has already advanced to the current record; a second call here would skip every other row.
            Object data = reader.getCurrentValue();
            records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
        }
        return records;
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), RecordReader (org.apache.hadoop.mapreduce.RecordReader), ArrayList (java.util.ArrayList), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), InputFormat (org.apache.hadoop.mapreduce.InputFormat), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), InputRow (org.apache.druid.data.input.InputRow), InputRowParser (org.apache.druid.data.input.impl.InputRowParser), Job (org.apache.hadoop.mapreduce.Job), File (java.io.File)
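
The same harness works for exercising any InputFormat outside a cluster: build a TaskAttemptContextImpl from the job configuration and drive the reader by hand. A minimal sketch for plain TextInputFormat, with an illustrative input path:

Job job = Job.getInstance(new Configuration());
FileInputFormat.setInputPaths(job, new Path("/tmp/sample.txt"));
TextInputFormat format = new TextInputFormat();
TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
InputSplit split = format.getSplits(job).get(0);
try (RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context)) {
    reader.initialize(split, context);
    while (reader.nextKeyValue()) {
        // Each nextKeyValue() call consumes exactly one record.
        System.out.println(reader.getCurrentValue());
    }
}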

Aggregations

InputFormat (org.apache.hadoop.mapreduce.InputFormat): 21 usages
Configuration (org.apache.hadoop.conf.Configuration): 11
Path (org.apache.hadoop.fs.Path): 11
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10
Job (org.apache.hadoop.mapreduce.Job): 9
RecordReader (org.apache.hadoop.mapreduce.RecordReader): 9
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 9
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 7
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 7
Test (org.junit.Test): 7
ArrayList (java.util.ArrayList): 5
HashMap (java.util.HashMap): 3
Map (java.util.Map): 3
Mapper (org.apache.hadoop.mapreduce.Mapper): 3
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 3
File (java.io.File): 2
List (java.util.List): 2
KV (org.apache.beam.sdk.values.KV): 2
Text (org.apache.hadoop.io.Text): 2
FileInputFormat (org.apache.hadoop.mapreduce.lib.input.FileInputFormat): 2