Search in sources :

Example 6 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class CSVAssignRowIDMapper method configure.

/**
 * Initializes this mapper from the job configuration.
 *
 * <p>Sets the output key to the first input matrix index of this mapper, records the
 * fully qualified name of the current map input file, flags whether that file is the
 * "smallest" (header-carrying) file registered for this input, and picks up the
 * delimiter / header settings from the first CSV reblock instruction whose input
 * matches this index. If the transform flag is set in the job, the transform
 * utilities are created as well.
 *
 * @param job the job configuration to read settings from
 * @throws RuntimeException wrapping any exception raised while reading the configuration
 */
@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    try {
        // Repeated file names in the input make no sense for reblock, so only the
        // first matrix index is considered.
        final byte inputIndex = MRJobConfiguration.getInputMatrixIndexesInMapper(job).get(0);
        outKey.set(inputIndex);

        // Fully qualified name of the file this map task is reading.
        Path rawInputPath = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fileSystem = IOUtilFunctions.getFileSystem(rawInputPath, job);
        filename = rawInputPath.makeQualified(fileSystem).toString();

        // This task handles the header file iff its input equals the registered
        // smallest file name for this matrix index.
        String[] smallestFileNames = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT);
        String qualifiedHeaderName = new Path(smallestFileNames[inputIndex]).makeQualified(fileSystem).toString();
        headerFile = qualifiedHeaderName.equals(filename);

        // Take delimiter and header flag from the first instruction reading this input.
        for (CSVReblockInstruction candidate : MRJobConfiguration.getCSVReblockInstructions(job)) {
            if (candidate.input != inputIndex) {
                continue;
            }
            delim = Pattern.quote(candidate.delim);
            ignoreFirstLine = candidate.hasHeader;
            break;
        }

        // Load properties relevant to transform, only when the job requests it.
        if (job.getBoolean(MRJobConfiguration.TF_TRANSFORM, false)) {
            _agents = new TfUtils(job, true);
        }
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TfUtils(org.apache.sysml.runtime.transform.TfUtils) FileSystem(org.apache.hadoop.fs.FileSystem) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) IOException(java.io.IOException)

Example 7 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class CSVReblockReducer method configure.

/**
 * Initializes this reducer from the job configuration.
 *
 * <p>Sets the matrix value class on the job, delegates to the superclass, then parses
 * the CSV reblock instructions and stores, per instruction output index, the matrix
 * characteristics that the job holds for reblock.
 *
 * @param job the job configuration to read settings from
 * @throws RuntimeException wrapping a {@code DMLRuntimeException} raised while parsing
 *         the reblock instructions
 */
@Override
public void configure(JobConf job) {
    MRJobConfiguration.setMatrixValueClass(job, true);
    super.configure(job);

    // Parse the reblock instructions.
    final CSVReblockInstruction[] parsedInstructions;
    try {
        parsedInstructions = MRJobConfiguration.getCSVReblockInstructions(job);
    } catch (DMLRuntimeException ex) {
        throw new RuntimeException(ex);
    }

    // Register the reblock characteristics under each instruction's output index.
    for (int i = 0; i < parsedInstructions.length; i++) {
        ReblockInstruction current = parsedInstructions[i];
        dimensions.put(current.output, MRJobConfiguration.getMatrixCharactristicsForReblock(job, current.output));
    }
}
Also used : DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) ReblockInstruction(org.apache.sysml.runtime.instructions.mr.ReblockInstruction)

Example 8 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project incubator-systemml by apache.

the class CSVReblockMapper method configure.

/**
 * Initializes this mapper from the job configuration.
 *
 * <p>After the superclass setup, this method:
 * <ol>
 *   <li>resolves the fully qualified name of the current map input file and sets
 *       {@code headerFile} when it equals the registered smallest file name for the
 *       first representative matrix;</li>
 *   <li>reads the row-id sequence file and stores, in {@code offsetMap}, the
 *       file-offset to count pairs recorded for this matrix index and input file;</li>
 *   <li>takes the delimiter and header flag from the first CSV reblock instruction;</li>
 *   <li>allocates the reusable row buffer with one row and as many columns as the
 *       largest {@code bclen} across all CSV reblock instructions.</li>
 * </ol>
 *
 * @param job the job configuration to read settings from
 * @throws RuntimeException wrapping an {@code IOException} raised while accessing the
 *         file system or the row-id file
 */
@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    super.configure(job);

    final byte targetIndex = representativeMatrixes.get(0);
    try {
        // Fully qualified name of the file this map task is reading.
        Path rawInputPath = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fileSystem = IOUtilFunctions.getFileSystem(rawInputPath, job);
        String qualifiedInput = rawInputPath.makeQualified(fileSystem).toString();

        // Only ever switch the flag on here; it is left untouched otherwise.
        String[] smallestFileNames = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT);
        String qualifiedHeader = new Path(smallestFileNames[targetIndex]).makeQualified(fileSystem).toString();
        if (qualifiedHeader.equals(qualifiedInput)) {
            headerFile = true;
        }

        // Load the offset mapping for this matrix index and input file.
        ByteWritable rowKey = new ByteWritable();
        OffsetCount rowValue = new OffsetCount();
        Path rowIdPath = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        SequenceFile.Reader rowIdReader = null;
        try {
            rowIdReader = new SequenceFile.Reader(fileSystem, rowIdPath, job);
            while (rowIdReader.next(rowKey, rowValue)) {
                if (rowKey.get() != targetIndex || !qualifiedInput.equals(rowValue.filename)) {
                    continue;
                }
                offsetMap.put(rowValue.fileOffset, rowValue.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(rowIdReader);
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }

    // Delimiter and header handling come from the very first instruction.
    CSVReblockInstruction firstInstruction = csv_reblock_instructions.get(0).get(0);
    _delim = firstInstruction.delim;
    ignoreFirstLine = firstInstruction.hasHeader;

    // Size the row buffer for the widest block among all instructions.
    idxRow = new IndexedBlockRow();
    int widestBlock = 0;
    for (ArrayList<CSVReblockInstruction> group : csv_reblock_instructions) {
        for (CSVReblockInstruction instruction : group) {
            widestBlock = Math.max(widestBlock, instruction.bclen);
        }
    }
    // Always dense, since that is the common CSV use case.
    idxRow.getRow().data.reset(1, widestBlock, false);
}
Also used : Path(org.apache.hadoop.fs.Path) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) IOException(java.io.IOException) OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) ByteWritable(org.apache.hadoop.io.ByteWritable)

Aggregations

CSVReblockInstruction (org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction): 8; Path (org.apache.hadoop.fs.Path): 5; DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 5; IOException (java.io.IOException): 4; FileSystem (org.apache.hadoop.fs.FileSystem): 4; ArrayList (java.util.ArrayList): 2; ByteWritable (org.apache.hadoop.io.ByteWritable): 2; SequenceFile (org.apache.hadoop.io.SequenceFile): 2; JobConf (org.apache.hadoop.mapred.JobConf): 2; ReblockInstruction (org.apache.sysml.runtime.instructions.mr.ReblockInstruction): 2; OffsetCount (org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount): 2; JobReturn (org.apache.sysml.runtime.matrix.JobReturn): 2; HashSet (java.util.HashSet): 1; Group (org.apache.hadoop.mapred.Counters.Group): 1; RunningJob (org.apache.hadoop.mapred.RunningJob): 1; DMLConfig (org.apache.sysml.conf.DMLConfig): 1; DataGenMRInstruction (org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction): 1; MRInstruction (org.apache.sysml.runtime.instructions.mr.MRInstruction): 1; PMMJMRInstruction (org.apache.sysml.runtime.instructions.mr.PMMJMRInstruction): 1; AssignRowIDMRReturn (org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn): 1