Example 1 with OffsetCount

Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.

In class TfUtils, the method getPartFileID:

/**
 * Function to generate custom file names (transform-part-.....) for
 * mappers' output for the ApplyTfCSV job. The idea is to find the index
 * of (thisfile, fileoffset) in the list of all offsets from the
 * counters/offsets file, which was generated by either the GenTfMtdMR
 * or the AssignRowIDMR job.
 *
 * @param job job configuration
 * @param offset file offset
 * @return part file id (i.e., 00001, 00002, etc.)
 * @throws IOException if an IOException occurs
 */
public String getPartFileID(JobConf job, long offset) throws IOException {
    Reader reader = null;
    int id = 0;
    try {
        reader = initOffsetsReader(job);
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        String thisFile = TfUtils.getPartFileName(job);
        while (reader.next(key, value)) {
            if (thisFile.equals(value.filename) && value.fileOffset == offset)
                break;
            id++;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    String sid = Integer.toString(id);
    char[] carr = new char[5 - sid.length()];
    Arrays.fill(carr, '0');
    String ret = (new String(carr)).concat(sid);
    return ret;
}
Also used: OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount), MatrixReader(org.apache.sysml.runtime.io.MatrixReader), Reader(org.apache.hadoop.io.SequenceFile.Reader), ByteWritable(org.apache.hadoop.io.ByteWritable)
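The trailing char[] logic in getPartFileID left-pads the numeric id to five digits by hand. As a side note, a minimal standalone sketch of the same formatting (illustration only, not SystemML code; class and method names below are made up):

public class PartFileIdPaddingDemo {

    // %05d left-pads with '0' to a width of five characters, which matches
    // the char[]-based padding above for ids below 100000.
    static String toPartFileID(int id) {
        return String.format("%05d", id);
    }

    public static void main(String[] args) {
        System.out.println(toPartFileID(1));     // 00001
        System.out.println(toPartFileID(42));    // 00042
        System.out.println(toPartFileID(12345)); // 12345
    }
}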

Example 2 with OffsetCount

Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.

In class GTFMTDReducer, the method reduce:

@Override
public void reduce(IntWritable key, Iterator<DistinctValue> values, OutputCollector<Text, LongWritable> output, Reporter reporter) throws IOException {
    FileSystem fs = FileSystem.get(_rJob);
    int colID = key.get();
    if (colID < 0) {
        // process mapper output for MV and Bin agents
        colID = colID * -1;
        _agents.getMVImputeAgent().mergeAndOutputTransformationMetadata(values, _agents.getTfMtdDir(), colID, fs, _agents);
    } else if (colID == _agents.getNumCols() + 1) {
        // process mapper output for OFFSET_FILE
        ArrayList<OffsetCount> list = new ArrayList<OffsetCount>();
        while (values.hasNext()) list.add(new OffsetCount(values.next().getOffsetCount()));
        long numTfRows = generateOffsetsFile(list);
        reporter.incrCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS, numTfRows);
    } else {
        // process mapper output for Recode agent
        _agents.getRecodeAgent().mergeAndOutputTransformationMetadata(values, _agents.getTfMtdDir(), colID, fs, _agents);
    }
}
Also used: OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount), FileSystem(org.apache.hadoop.fs.FileSystem), ArrayList(java.util.ArrayList)
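generateOffsetsFile itself is not shown in this snippet. As a rough, self-contained illustration of the kind of bookkeeping it involves, sorting the collected records and totalling their row counts, here is a sketch that uses a simplified stand-in class (so it compiles without SystemML on the classpath) and makes no claim about the real implementation:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

// Simplified stand-in for CSVReblockMR.OffsetCount; the field names mirror
// those referenced in the snippets on this page (filename, fileOffset, count).
class SimpleOffsetCount {
    String filename;
    long fileOffset; // byte offset of the split within its input file
    long count;      // number of rows contributed by that split

    SimpleOffsetCount(String filename, long fileOffset, long count) {
        this.filename = filename;
        this.fileOffset = fileOffset;
        this.count = count;
    }
}

public class OffsetsFileSketch {

    // Sort the splits by (filename, fileOffset) and total up their row counts.
    static long totalRows(List<SimpleOffsetCount> list) {
        list.sort(Comparator
                .comparing((SimpleOffsetCount oc) -> oc.filename)
                .thenComparingLong(oc -> oc.fileOffset));
        long numRows = 0;
        for (SimpleOffsetCount oc : list)
            numRows += oc.count;
        return numRows;
    }

    public static void main(String[] args) {
        List<SimpleOffsetCount> list = new ArrayList<>();
        list.add(new SimpleOffsetCount("data.csv", 0, 100));
        list.add(new SimpleOffsetCount("data.csv", 4096, 150));
        System.out.println(totalRows(list)); // 250
    }
}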

Example 3 with OffsetCount

Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.

In class ApplyTfBBMapper, the method configure:

@Override
public void configure(JobConf job) {
    super.configure(job);
    try {
        _partFileWithHeader = TfUtils.isPartFileWithHeader(job);
        tfmapper = new TfUtils(job);
        tfmapper.loadTfMetadata(job, true);
        // Load relevant information for CSV Reblock
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        Path path = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        String thisfile = path.makeQualified(fs).toString();
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, p, job);
            while (reader.next(key, value)) {
                // "key" needn't be checked since the offset file has information about a single CSV input (the raw data file)
                if (thisfile.equals(value.filename))
                    offsetMap.put(value.fileOffset, value.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
        idxRow = new CSVReblockMapper.IndexedBlockRow();
        int maxBclen = 0;
        for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions) for (CSVReblockInstruction in : insv) {
            if (maxBclen < in.bclen)
                maxBclen = in.bclen;
        }
// always dense since this is the common CSV use case
        idxRow.getRow().data.reset(1, maxBclen, false);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (JSONException e) {
        throw new RuntimeException(e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path), CSVReblockMapper(org.apache.sysml.runtime.matrix.mapred.CSVReblockMapper), IndexedBlockRow(org.apache.sysml.runtime.matrix.mapred.CSVReblockMapper.IndexedBlockRow), CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction), JSONException(org.apache.wink.json4j.JSONException), IOException(java.io.IOException), OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount), DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException), SequenceFile(org.apache.hadoop.io.SequenceFile), FileSystem(org.apache.hadoop.fs.FileSystem), ByteWritable(org.apache.hadoop.io.ByteWritable)
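The offsetMap built above maps each input split's byte offset (value.fileOffset) to the global row index at which that split starts (value.count, rewritten to a cumulative line offset by CSVAssignRowIDReducer in Example 4 below). How the map is consumed in the map phase is not part of this snippet; the following is a hypothetical lookup sketch, not the actual ApplyTfBBMapper code:

import java.util.HashMap;
import java.util.Map;

// Illustration only: turning a split's byte offset plus a row position within
// the split into a global row index via a map like the offsetMap above.
public class OffsetMapLookupDemo {
    public static void main(String[] args) {
        Map<Long, Long> offsetMap = new HashMap<>();
        offsetMap.put(0L, 0L);       // the split starting at byte 0 begins at row 0
        offsetMap.put(4096L, 100L);  // the split starting at byte 4096 begins at row 100

        long splitOffset = 4096L;    // byte offset of the current input split
        long rowWithinSplit = 7;     // 0-based row position inside this split
        long globalRow = offsetMap.get(splitOffset) + rowWithinSplit;
        System.out.println(globalRow); // 107
    }
}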

Example 4 with OffsetCount

Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.

In class CSVAssignRowIDReducer, the method reduce:

@Override
@SuppressWarnings("unchecked")
public void reduce(ByteWritable key, Iterator<OffsetCount> values, OutputCollector<ByteWritable, OffsetCount> out, Reporter report) throws IOException {
    //need to sort the values by filename and fileoffset
    while (values.hasNext()) list.add(new OffsetCount(values.next()));
    Collections.sort(list);
    long lineOffset = 0;
    for (OffsetCount oc : list) {
        long count = oc.count;
        oc.count = lineOffset;
        out.collect(key, oc);
        lineOffset += count;
    }
    report.incrCounter(CSVReblockMR.NUM_ROWS_IN_MATRIX, key.toString(), lineOffset);
    list.clear();
}
Also used: OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount)
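The loop above rewrites each record's count field into the starting row (cumulative line offset) of its split, while the running total ends up as the overall row count reported to the NUM_ROWS_IN_MATRIX counter. A small self-contained demonstration of that rewrite with concrete numbers (plain Java, no Hadoop types):

public class RowOffsetDemo {
    public static void main(String[] args) {
        long[] counts = { 100, 150, 200 }; // rows per split, already in sorted order
        long lineOffset = 0;
        for (int i = 0; i < counts.length; i++) {
            long count = counts[i];
            // each split's "count" becomes the row index at which it starts
            System.out.println("split " + i + " starts at row " + lineOffset);
            lineOffset += count;
        }
        System.out.println("total rows: " + lineOffset); // 450
    }
}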

Example 5 with OffsetCount

Use of org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount in project incubator-systemml by apache.

In class CSVReblockMapper, the method configure:

@Override
@SuppressWarnings("deprecation")
public void configure(JobConf job) {
    super.configure(job);
// get the number of columns per block
// load the offset mapping
    byte matrixIndex = representativeMatrixes.get(0);
    try {
        Path thisPath = new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE));
        FileSystem fs = IOUtilFunctions.getFileSystem(thisPath, job);
        thisPath = thisPath.makeQualified(fs);
        String filename = thisPath.toString();
        Path headerPath = new Path(job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[matrixIndex]).makeQualified(fs);
        if (headerPath.toString().equals(filename))
            headerFile = true;
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME));
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(fs, p, job);
            while (reader.next(key, value)) {
                if (key.get() == matrixIndex && filename.equals(value.filename))
                    offsetMap.put(value.fileOffset, value.count);
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0);
    _delim = ins.delim;
    ignoreFirstLine = ins.hasHeader;
    idxRow = new IndexedBlockRow();
    int maxBclen = 0;
    for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions) for (CSVReblockInstruction in : insv) {
        if (maxBclen < in.bclen)
            maxBclen = in.bclen;
    }
// always dense since this is the common CSV use case
    idxRow.getRow().data.reset(1, maxBclen, false);
}
Also used: Path(org.apache.hadoop.fs.Path), CSVReblockInstruction(org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction), IOException(java.io.IOException), OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount), SequenceFile(org.apache.hadoop.io.SequenceFile), FileSystem(org.apache.hadoop.fs.FileSystem), ByteWritable(org.apache.hadoop.io.ByteWritable)
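The @SuppressWarnings("deprecation") annotation presumably covers the SequenceFile.Reader(fs, p, job) constructor, which Hadoop 2.x deprecates in favor of the options-based constructor. A minimal sketch of that replacement, assuming Hadoop 2.x and using LongWritable as a stand-in value type instead of OffsetCount:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;

public class SequenceFileReadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path(args[0]); // a SequenceFile of (ByteWritable, LongWritable) pairs
        SequenceFile.Reader reader = null;
        try {
            // options-based constructor; avoids the deprecated (fs, path, conf) form
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(p));
            ByteWritable key = new ByteWritable();
            LongWritable value = new LongWritable();
            while (reader.next(key, value)) {
                System.out.println(key.get() + " -> " + value.get());
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}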

Aggregations

OffsetCount (org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount): 7
ByteWritable (org.apache.hadoop.io.ByteWritable): 4
FileSystem (org.apache.hadoop.fs.FileSystem): 3
Path (org.apache.hadoop.fs.Path): 3
SequenceFile (org.apache.hadoop.io.SequenceFile): 3
IOException (java.io.IOException): 2
CSVReblockInstruction (org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction): 2
ArrayList (java.util.ArrayList): 1
IntWritable (org.apache.hadoop.io.IntWritable): 1
Reader (org.apache.hadoop.io.SequenceFile.Reader): 1
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 1
MatrixReader (org.apache.sysml.runtime.io.MatrixReader): 1
CSVReblockMapper (org.apache.sysml.runtime.matrix.mapred.CSVReblockMapper): 1
IndexedBlockRow (org.apache.sysml.runtime.matrix.mapred.CSVReblockMapper.IndexedBlockRow): 1
JSONException (org.apache.wink.json4j.JSONException): 1