Example 1 with CombineSequenceFileInputFormat

Use of org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat in project hadoop by apache.

From class TestCombineSequenceFileInputFormat, method testFormat:

@Test(timeout = 10000)
public void testFormat() throws Exception {
    // conf, localFs, workDir, LOG and createFiles(...) are fields/helpers of the enclosing test class
    JobConf job = new JobConf(conf);
    Reporter reporter = Reporter.NULL;
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create numFiles files of various lengths
    createFiles(length, numFiles, random);
    // create a combining input format over the files; splits are requested below
    InputFormat<IntWritable, BytesWritable> format = new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(job, numSplits);
        LOG.info("splitting: got =        " + splits.length);
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.length);
        InputSplit split = splits[0];
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check each split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader = format.getRecordReader(split, job, reporter);
        try {
            while (reader.next(key, value)) {
                assertFalse("Key in multiple partitions.", bits.get(key.get()));
                bits.set(key.get());
            }
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used: org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat, java.util.BitSet, org.apache.hadoop.io.BytesWritable, java.util.Random, org.apache.hadoop.io.IntWritable, org.junit.Test
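
For orientation, here is a minimal driver sketch showing how CombineSequenceFileInputFormat is typically wired into a plain mapred job. The job name, the input/output paths and the 256 MB split cap are illustrative assumptions, not taken from the test above; the configuration key mapreduce.input.fileinputformat.split.maxsize is the standard property that CombineFileInputFormat honors.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

public class CombineSeqFileDriver {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(CombineSeqFileDriver.class);
        job.setJobName("combine-seqfile-example"); // hypothetical job name

        // pack many small SequenceFiles into few splits; each CombineFileSplit
        // may span multiple files, so far fewer map tasks are launched
        job.setInputFormat(CombineSequenceFileInputFormat.class);

        // cap the size of a combined split (256 MB is an illustrative value)
        job.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);

        FileInputFormat.setInputPaths(job, new Path("/data/small-seqfiles")); // hypothetical path
        FileOutputFormat.setOutputPath(job, new Path("/data/out")); // hypothetical path

        job.setMapperClass(IdentityMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(BytesWritable.class);

        JobClient.runJob(job);
    }
}

Capping the split size bounds how many small files land in one combined split, which is exactly the latency-versus-parallelism trade-off the SystemML example below makes explicit.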

Example 2 with CombineSequenceFileInputFormat

Use of org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat in project incubator-systemml by apache.

From class MRJobConfiguration, method setUpMultipleInputs:

public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target) throws Exception {
    if (inputs.length != inputInfos.length)
        throw new Exception("number of inputs and inputInfos does not match");
    // set up names of the input matrices and their InputFormat information
    job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
    MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);
    // set up converter infos (the converter is determined implicitly)
    if (setConverter) {
        for (int i = 0; i < inputs.length; i++)
            setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
    }
    // remove redundant inputs and pure broadcast variables
    ArrayList<Path> lpaths = new ArrayList<Path>();
    ArrayList<InputInfo> liinfos = new ArrayList<InputInfo>();
    for (int i = 0; i < inputs.length; i++) {
        Path p = new Path(inputs[i]);
        // skip redundant inputs: the path is already included,
        // or the input is only required in the distributed cache
        if (lpaths.contains(p) || distCacheOnly[i]) {
            continue;
        }
        lpaths.add(p);
        liinfos.add(inputInfos[i]);
    }
    boolean combineInputFormat = false;
    if (OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT) {
        // determine the total input size
        double totalInputSize = 0;
        for (int i = 0; i < inputs.length; i++)
            totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));
        // set the max split size (default: blocksize) to 2x blocksize if (1) the sort buffer is large enough,
        // (2) the degree of parallelism is not hurt, and (3) there is only a single input (except broadcasts)
        // (the sort buffer size is relevant for the pass-through of, potentially modified, inputs to the reducers;
        // the single-input constraint stems from internal runtime assumptions used to relate metadata to inputs)
        long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
        long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
        // use the generic config API for backwards compatibility
        long newSplitSize = sizeHDFSBlk * 2;
        double spillPercent = Double.parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
        int numPMap = OptimizerUtils.getNumMappers();
        if (numPMap < totalInputSize / newSplitSize && sizeSortBuff * spillPercent >= newSplitSize && lpaths.size() == 1) {
            job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
            combineInputFormat = true;
        }
    }
    // add inputs to the job's input (incl. input format configuration)
    for (int i = 0; i < lpaths.size(); i++) {
        // for binaryblock inputs we use CombineSequenceFileInputFormat to reduce task latency
        if (combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo)
            MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
        else
            MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
    }
}
Also used: org.apache.hadoop.fs.Path, org.apache.sysml.runtime.matrix.data.InputInfo, org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat, java.util.ArrayList, org.apache.sysml.runtime.DMLRuntimeException, java.io.IOException
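
The combineInputFormat guard above packs three conditions into one expression. As a hedged illustration, the same decision can be traced with standalone numbers; the constants below (128 MB blocks, a 512 MB sort buffer, a 0.8 spill percent, 20 map slots, 10 GB of input) are made-up values for the sketch, not SystemML defaults.

/** Standalone sketch of the combine-input-format heuristic from setUpMultipleInputs. */
public class CombineHeuristicSketch {
    public static void main(String[] args) {
        long sizeHDFSBlk = 128L << 20;       // assumed 128 MB HDFS block size
        long newSplitSize = sizeHDFSBlk * 2; // 256 MB, i.e. 2x blocksize
        long sizeSortBuff = 512L << 20;      // assumed 512 MB sort buffer
        double spillPercent = 0.8;           // assumed mapreduce.map.sort.spill.percent
        int numPMap = 20;                    // assumed map-task parallelism of the cluster
        double totalInputSize = 10L << 30;   // assumed 10 GB of input data
        int numInputs = 1;                   // a single non-broadcast input

        // (1) parallelism is not hurt: 20 < 10 GB / 256 MB = 40 splits
        boolean parallelismOk = numPMap < totalInputSize / newSplitSize;
        // (2) a whole split fits into the usable part of the sort buffer:
        //     512 MB * 0.8 = 409.6 MB >= 256 MB
        boolean sortBufferOk = sizeSortBuff * spillPercent >= newSplitSize;
        // (3) exactly one (non-broadcast) input
        boolean singleInput = numInputs == 1;

        boolean combineInputFormat = parallelismOk && sortBufferOk && singleInput;
        System.out.println("use CombineSequenceFileInputFormat: " + combineInputFormat); // prints true
    }
}

With these numbers all three conditions hold, so the job would cap mapreduce.input.fileinputformat.split.maxsize at 2x the block size and route binaryblock inputs through CombineSequenceFileInputFormat.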

Aggregations

CombineSequenceFileInputFormat (org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat): 2
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
BitSet (java.util.BitSet): 1
Random (java.util.Random): 1
Path (org.apache.hadoop.fs.Path): 1
BytesWritable (org.apache.hadoop.io.BytesWritable): 1
IntWritable (org.apache.hadoop.io.IntWritable): 1
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 1
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 1
Test (org.junit.Test): 1