
Example 36 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project carbondata by apache.

The class MapredCarbonInputFormat, method getSplits.

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    org.apache.hadoop.mapreduce.JobContext jobContext = Job.getInstance(jobConf);
    List<org.apache.hadoop.mapreduce.InputSplit> splitList = super.getSplits(jobContext);
    InputSplit[] splits = new InputSplit[splitList.size()];
    CarbonInputSplit split = null;
    for (int i = 0; i < splitList.size(); i++) {
        split = (CarbonInputSplit) splitList.get(i);
        splits[i] = new CarbonHiveInputSplit(split.getSegmentId(), split.getPath(), split.getStart(),
            split.getLength(), split.getLocations(), split.getNumberOfBlocklets(), split.getVersion(),
            split.getBlockStorageIdMap());
    }
    return splits;
}
Also used : CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
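
Note on the pattern above: MapredCarbonInputFormat bridges the new mapreduce API to the old mapred API that Hive expects, converting each new-API split into an old-API wrapper. Below is a minimal, self-contained sketch of the same bridging idea using Hadoop's own new-API TextInputFormat and the old-API FileSplit; the method name toOldApiSplits is ours, not CarbonData's, and the JobConf is assumed to already have input paths configured.

public static InputSplit[] toOldApiSplits(JobConf jobConf) throws IOException {
    // build a new-API job context from the old-API configuration
    org.apache.hadoop.mapreduce.Job jobContext = org.apache.hadoop.mapreduce.Job.getInstance(jobConf);
    // compute splits with a new-API format; TextInputFormat yields FileSplit instances
    List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
        new org.apache.hadoop.mapreduce.lib.input.TextInputFormat().getSplits(jobContext);
    InputSplit[] oldSplits = new InputSplit[newSplits.size()];
    for (int i = 0; i < newSplits.size(); i++) {
        org.apache.hadoop.mapreduce.lib.input.FileSplit fs =
            (org.apache.hadoop.mapreduce.lib.input.FileSplit) newSplits.get(i);
        // re-express each split in the old API, as CarbonHiveInputSplit does above
        oldSplits[i] = new FileSplit(fs.getPath(), fs.getStart(), fs.getLength(), fs.getLocations());
    }
    return oldSplits;
}
Also used : IOException(java.io.IOException) List(java.util.List) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) JobConf(org.apache.hadoop.mapred.JobConf)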

Example 37 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project cdap by caskdata.

The class DatasetInputFormat, method getSplits.

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }
        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            // note: new Job(Configuration) is deprecated; Job.getInstance(jobConf) is the modern equivalent
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) RecordScannable(co.cask.cdap.api.data.batch.RecordScannable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) Split(co.cask.cdap.api.data.batch.Split) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
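
DatasetInputSplit adapts a CDAP Split to the Writable-based contract of org.apache.hadoop.mapred.InputSplit. For reference, a minimal custom split only has to provide a size hint, location hints, and Writable round-tripping. The sketch below is generic; the class name PayloadInputSplit and its payload field are illustrative, not CDAP's.

public class PayloadInputSplit implements InputSplit {
    private String payload = "";   // e.g. a serialized dataset Split

    public PayloadInputSplit() { }            // no-arg constructor required for readFields()
    public PayloadInputSplit(String payload) { this.payload = payload; }

    @Override
    public long getLength() throws IOException {
        return payload.length();  // best-effort size hint for the scheduler
    }

    @Override
    public String[] getLocations() throws IOException {
        return new String[0];     // no locality preference
    }

    @Override
    public void write(DataOutput out) throws IOException {
        Text.writeString(out, payload);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        payload = Text.readString(in);
    }
}
Also used : DataInput(java.io.DataInput) DataOutput(java.io.DataOutput) IOException(java.io.IOException) Text(org.apache.hadoop.io.Text) InputSplit(org.apache.hadoop.mapred.InputSplit)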

Example 38 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project drill by apache.

The class HiveSubScan, method deserializeInputSplit.

public static InputSplit deserializeInputSplit(String base64, String className) throws IOException, ReflectiveOperationException {
    Constructor<?> constructor = Class.forName(className).getDeclaredConstructor();
    // note: getDeclaredConstructor() throws NoSuchMethodException rather than returning
    // null when no no-arg constructor exists, so this null check is purely defensive
    if (constructor == null) {
        throw new ReflectiveOperationException("Class " + className + " does not implement a default constructor.");
    }
    constructor.setAccessible(true);
    InputSplit split = (InputSplit) constructor.newInstance();
    ByteArrayDataInput byteArrayDataInput = ByteStreams.newDataInput(Base64.decodeBase64(base64));
    split.readFields(byteArrayDataInput);
    return split;
}
Also used : ByteArrayDataInput(com.google.common.io.ByteArrayDataInput) InputSplit(org.apache.hadoop.mapred.InputSplit)
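
For context, deserializeInputSplit reverses a Base64 encoding of the split's Writable form. The matching encode step is sketched below with the same Guava and commons-codec helpers; this method body is our reconstruction, not copied from Drill.

public static String serializeInputSplit(InputSplit split) throws IOException {
    // write the split's Writable form into an in-memory buffer
    ByteArrayDataOutput byteArrayOutputStream = ByteStreams.newDataOutput();
    split.write(byteArrayOutputStream);
    // Base64-encode the raw bytes so the split can travel as a string in a serialized plan
    return Base64.encodeBase64String(byteArrayOutputStream.toByteArray());
}
Also used : ByteArrayDataOutput(com.google.common.io.ByteArrayDataOutput) ByteStreams(com.google.common.io.ByteStreams) Base64(org.apache.commons.codec.binary.Base64) IOException(java.io.IOException) InputSplit(org.apache.hadoop.mapred.InputSplit)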

Example 39 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class ResultMergeLocalFile, method createTextCellStagingFile.

private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell)
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                // periodic flush
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell)
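
Both SystemML examples in this listing share the same old-API read pattern: configure the format, ask for splits, then drain a RecordReader per split. A compact, self-contained version of that loop follows; countLines and its argument are illustrative, not SystemML code.

public static long countLines(String fname) throws IOException {
    JobConf job = new JobConf();
    FileInputFormat.addInputPath(job, new Path(fname));
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    long count = 0;
    for (InputSplit split : informat.getSplits(job, 1)) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            // createKey/createValue give reusable holders that next() refills
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) count++;
        } finally {
            reader.close();
        }
    }
    return count;
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) RecordReader(org.apache.hadoop.mapred.RecordReader) Reporter(org.apache.hadoop.mapred.Reporter) InputSplit(org.apache.hadoop.mapred.InputSplit)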

Example 40 with InputSplit

Use of org.apache.hadoop.mapred.InputSplit in project incubator-systemml by apache.

The class DataPartitionerLocal, method partitionTextCell.

private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen, int brlen, int bclen) {
    long row = -1;
    long col = -1;
    try {
        // STEP 1: read matrix from HDFS and write blocks to local staging area
        // check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        LinkedList<Cell> buffer = new LinkedList<>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    // reset tokenizer
                    st.reset(value.toString());
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    buffer.addLast(tmp);
                    // periodic flush
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }
                // final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }
        // STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging, fnamesPartitions, start, end));
                threads[i].start();
            }
            for (Thread t : threads) t.join();
        } else {
            for (String pdir : fnamesPartitions) writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        // post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell) SequenceFile(org.apache.hadoop.io.SequenceFile) File(java.io.File)
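
In STEP 2 of partitionTextCell, the parallel branch hands each worker thread a contiguous chunk of ceil(n/len) staging directories, clamping the last range to the array bound. A small worked check of that range arithmetic; printRanges is illustrative, not SystemML code.

public static void printRanges(int n, int len) {
    // mirrors the start/end computation in the PARALLEL branch above
    int chunk = (int) Math.ceil(((double) n) / len);
    for (int i = 0; i < len; i++) {
        int start = i * chunk;
        int end = Math.min((i + 1) * chunk - 1, n - 1);
        System.out.println("thread " + i + ": [" + start + ", " + end + "]");
    }
}
// printRanges(10, 4) prints [0, 2], [3, 5], [6, 8], [9, 9]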

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8