Search in sources :

Example 11 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project systemml by apache.

From the class CSVAssignRowIDMapper, method configure.

/**
 * Initializes this mapper from the job configuration.
 *
 * <p>Sets the output key to the first input matrix index, records the fully
 * qualified name of the file being mapped, determines whether that file is the
 * one listed under {@code CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT} for this
 * index, and copies the delimiter (regex-quoted) and header flag from the first
 * CSV reblock instruction whose input equals this index.
 *
 * @param job the job configuration to read from
 * @throws RuntimeException wrapping any exception raised during setup
 */
@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    try {
        // Repeated file names in the input make no sense for reblock,
        // so only the first matrix index is considered.
        final byte inputIndex = MRJobConfiguration.getInputMatrixIndexesInMapper(job).get(0);
        outKey.set(inputIndex);

        // Qualify the current input file against its file system before comparing names.
        Path rawInput = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(rawInput, job);
        Path qualifiedInput = rawInput.makeQualified(fs);
        filename = qualifiedInput.toString();

        // This mapper handles the header file iff its input is the configured file for this index.
        String configuredName = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[inputIndex];
        String qualifiedHeaderName = new Path(configuredName).makeQualified(fs).toString();
        headerFile = qualifiedHeaderName.equals(filename);

        // Take delimiter and header settings from the first instruction reading this input.
        for (CSVReblockInstruction candidate : MRJobConfiguration.getCSVReblockInstructions(job)) {
            if (candidate.input != inputIndex) {
                continue;
            }
            delim = Pattern.quote(candidate.delim);
            ignoreFirstLine = candidate.hasHeader;
            break;
        }
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) IOException(java.io.IOException)

Example 12 with CSVReblockInstruction

use of org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction in project systemml by apache.

From the class CSVReblockMapper, method configure.

/**
 * Initializes this mapper from the job configuration.
 *
 * <p>After delegating to the superclass, this method marks the mapper as the
 * header-file mapper when its input file matches the entry of
 * {@code CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT} for the first representative
 * matrix, loads the offset entries for this file and matrix from the sequence
 * file named by {@code CSVReblockMR.ROWID_FILE_NAME} into {@code offsetMap},
 * takes delimiter and header settings from the first CSV reblock instruction,
 * and resets the reusable row buffer to one dense row as wide as the largest
 * {@code bclen} among all CSV reblock instructions.
 *
 * @param job the job configuration to read from
 * @throws RuntimeException wrapping any {@link IOException} raised while
 *         resolving paths or reading the row-id file
 */
@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    super.configure(job);

    final byte matrixIndex = representativeMatrixes.get(0);
    try {
        // Qualify the current input file against its file system before comparing names.
        Path rawInput = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(rawInput, job);
        String filename = rawInput.makeQualified(fs).toString();

        String configuredName = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[matrixIndex];
        Path headerPath = new Path(configuredName).makeQualified(fs);
        if (filename.equals(headerPath.toString())) {
            headerFile = true;
        }

        // Load the offset mapping: keep only entries for this matrix and this file.
        ByteWritable entryKey = new ByteWritable();
        OffsetCount entry = new OffsetCount();
        Path rowIdFile = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, rowIdFile, job);
            while (reader.next(entryKey, entry)) {
                if (entryKey.get() != matrixIndex || !filename.equals(entry.filename)) {
                    continue;
                }
                offsetMap.put(entry.fileOffset, entry.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }

    // Delimiter and header settings come from the very first instruction.
    CSVReblockInstruction first = csv_reblock_instructions.get(0).get(0);
    _delim = first.delim;
    ignoreFirstLine = first.hasHeader;

    // Find the largest number of columns per block across all instructions.
    idxRow = new IndexedBlockRow();
    int widestBlock = 0;
    for (ArrayList<CSVReblockInstruction> group : csv_reblock_instructions) {
        for (CSVReblockInstruction instruction : group) {
            widestBlock = Math.max(widestBlock, instruction.bclen);
        }
    }

    // Always dense, since that is the common CSV use case.
    idxRow.getRow().data.reset(1, widestBlock, false);
}
Also used : Path(org.apache.hadoop.fs.Path) CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction) IOException(java.io.IOException) OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount) SequenceFile(org.apache.hadoop.io.SequenceFile) FileSystem(org.apache.hadoop.fs.FileSystem) ByteWritable(org.apache.hadoop.io.ByteWritable)

Aggregations

CSVReblockInstruction (org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction)12 IOException (java.io.IOException)7 Path (org.apache.hadoop.fs.Path)7 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)7 FileSystem (org.apache.hadoop.fs.FileSystem)6 ReblockInstruction (org.apache.sysml.runtime.instructions.mr.ReblockInstruction)4 ArrayList (java.util.ArrayList)3 ByteWritable (org.apache.hadoop.io.ByteWritable)3 SequenceFile (org.apache.hadoop.io.SequenceFile)3 OffsetCount (org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount)3 HashSet (java.util.HashSet)2 JobConf (org.apache.hadoop.mapred.JobConf)2 DataGenMRInstruction (org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction)2 MRInstruction (org.apache.sysml.runtime.instructions.mr.MRInstruction)2 PMMJMRInstruction (org.apache.sysml.runtime.instructions.mr.PMMJMRInstruction)2 JobReturn (org.apache.sysml.runtime.matrix.JobReturn)2 Group (org.apache.hadoop.mapred.Counters.Group)1 RunningJob (org.apache.hadoop.mapred.RunningJob)1 DMLConfig (org.apache.sysml.conf.DMLConfig)1 AssignRowIDMRReturn (org.apache.sysml.runtime.matrix.CSVReblockMR.AssignRowIDMRReturn)1