Example 81 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class RemoteParForColocatedFileSplit method getLocations.

/**
 * Get the list of hostnames where the input split is located.
 */
@Override
public String[] getLocations() throws IOException {
    // Timing time = new Timing();
    // time.start();
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = IOUtilFunctions.getFileSystem(getPath(), job);
    // read task string
    LongWritable key = new LongWritable();
    Text value = new Text();
    RecordReader<LongWritable, Text> reader = null;
    try {
        reader = new NLineInputFormat().getRecordReader(this, job, Reporter.NULL);
        reader.next(key, value);
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    // parse task
    Task t = Task.parseCompactString(value.toString());
    // get all locations
    HashMap<String, Integer> hosts = new HashMap<>();
    if (t.getType() == TaskType.SET) {
        for (IntObject val : t.getIterations()) {
            String fname = _fname + "/" + String.valueOf(((val.getLongValue() - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1) countHosts(hosts, bl.getHosts());
        }
    } else { // TaskType.RANGE
        // since this is a serial process, we use just the first and last
        // iterations as a heuristic for location information
        long lFrom = t.getIterations().get(0).getLongValue();
        long lTo = t.getIterations().get(1).getLongValue();
        for (long li : new long[] { lFrom, lTo }) {
            String fname = _fname + "/" + String.valueOf(((li - 1) / _blen + 1));
            FileStatus status = fs.getFileStatus(new Path(fname));
            BlockLocation[] tmp1 = fs.getFileBlockLocations(status, 0, status.getLen());
            for (BlockLocation bl : tmp1) countHosts(hosts, bl.getHosts());
        }
    }
    // majority consensus on top host
    return getTopHosts(hosts);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HashMap(java.util.HashMap) Text(org.apache.hadoop.io.Text) BlockLocation(org.apache.hadoop.fs.BlockLocation) IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) FileSystem(org.apache.hadoop.fs.FileSystem) NLineInputFormat(org.apache.hadoop.mapred.lib.NLineInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf)

Example 82 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class ResultMergeLocalFile method createTextCellStagingFile.

private static void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID) throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    LinkedList<Cell> buffer = new LinkedList<>();
    LongWritable key = new LongWritable();
    Text value = new Text();
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    // long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    // NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');
    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                // reset tokenizer
                st.reset(value.toString());
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());
                Cell tmp = new Cell(row, col, lvalue);
                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) { // periodic flush
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }
            // final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) LinkedList(java.util.LinkedList) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) FastStringTokenizer(org.apache.sysml.runtime.util.FastStringTokenizer) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) MatrixCell(org.apache.sysml.runtime.matrix.data.MatrixCell) Cell(org.apache.sysml.runtime.controlprogram.parfor.util.Cell)

Example 83 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class ResultMergeLocalFile method mergeTextCellWithoutComp.

private static void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO) {
    try {
        // delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            // we're done
            return;
        }
        // actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameNew);
        FileSystem fs = IOUtilFunctions.getFileSystem(path, job);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        String valueStr = null;
        try {
            for (MatrixObject in : inMO) { // read/write all inputs
                if (LOG.isTraceEnabled())
                    LOG.trace("ResultMerge (local, file): Merge input " + in.hashCode() + " (fname=" + in.getFileName() + ") via stream merge");
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);
                LongWritable key = new LongWritable();
                Text value = new Text();
                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob, Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        IOUtilFunctions.closeSilently(reader);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Text(org.apache.hadoop.io.Text) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) OutputStreamWriter(java.io.OutputStreamWriter) LongWritable(org.apache.hadoop.io.LongWritable) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)

Example 84 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class MLContextConversionUtil method javaRDDStringCSVToMatrixObject.

/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param matrixMetadata
 *            matrix metadata
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringCSVToMatrixObject(JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2));
    return matrixObject;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 85 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class MLContextConversionUtil method javaRDDStringCSVToFrameObject.

/**
 * Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 */
public static FrameObject javaRDDStringCSVToFrameObject(JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(), false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd));
    return frameObject;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LongWritable(org.apache.hadoop.io.LongWritable)

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable): 445 usages
Text (org.apache.hadoop.io.Text): 220 usages
Test (org.junit.Test): 171 usages
IntWritable (org.apache.hadoop.io.IntWritable): 102 usages
Path (org.apache.hadoop.fs.Path): 99 usages
BytesWritable (org.apache.hadoop.io.BytesWritable): 70 usages
FloatWritable (org.apache.hadoop.io.FloatWritable): 68 usages
Configuration (org.apache.hadoop.conf.Configuration): 62 usages
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 62 usages
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 60 usages
ArrayList (java.util.ArrayList): 59 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 57 usages
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 53 usages
IOException (java.io.IOException): 49 usages
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 48 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 42 usages
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 40 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 37 usages
JobConf (org.apache.hadoop.mapred.JobConf): 37 usages
DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject): 35 usages