Search in sources :

Example 86 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class MLContextConversionUtil method javaRDDStringIJVToFrameObject.

/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code FrameObject}.
 * Note that metadata is required for IJV format: the matrix characteristics
 * and the frame schema of the resulting {@code FrameObject} are both taken
 * from it.
 * <p>
 * The text cells themselves are converted to binary blocks using an
 * all-{@code STRING} schema with one entry per column of the matrix
 * characteristics.
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata; must not be {@code null}
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject},
 *         or {@code null} if the text-cell to binary-block conversion
 *         fails with a {@code DMLRuntimeException}
 * @throws NullPointerException
 *             if {@code frameMetadata} is {@code null}
 */
public static FrameObject javaRDDStringIJVToFrameObject(JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    // The schema is read from the metadata below, so a null value can never
    // succeed. Fail fast with a descriptive message instead of building RDD
    // transformations first and then hitting an anonymous NPE.
    if (frameMetadata == null) {
        throw new NullPointerException("Frame metadata is required to convert a JavaRDD<String> in IJV format to a FrameObject");
    }
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = frameMetadata.asMatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        // Cells are parsed with a STRING type for every column.
        ValueType[] lschema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
        rdd = FrameRDDConverterUtils.textCellToBinaryBlock(jsc(), javaPairRDDText, mc, lschema);
    } catch (DMLRuntimeException e) {
        // Existing contract: report the failure and signal it with a null
        // result. Kept as-is because callers may rely on the null return.
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd));
    return frameObject;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LongWritable(org.apache.hadoop.io.LongWritable)

Example 87 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class FrameTest method testFrameGeneral.

/**
 * Shared driver for the frame tests: writes two random input frames (A and
 * B), executes the DML script via {@code ml.execute}, stores the script
 * outputs "A" and "C" under the output paths "AB" and "C", runs the R
 * script, and passes the DML and R results to {@code verifyFrameData}.
 * <p>
 * {@code DMLScript.USE_LOCAL_SPARK_CONFIG} and {@code DMLScript.rtplatform}
 * are overridden for the duration of the run and restored in the
 * {@code finally} block.
 *
 * @param iinfo
 *            input info; selects "text" vs. "csv" when fetching the string
 *            results and is passed to {@code readDMLFrameFromHDFS}
 * @param oinfo
 *            output info used to write the input frames; also selects
 *            IJV vs. CSV metadata for the RDD inputs
 * @param bFromDataFrame
 *            if {@code true}, A and B are passed to the script as
 *            DataFrames; otherwise as {@code JavaRDD<String>} plus
 *            {@code FrameMetadata}
 * @param bToDataFrame
 *            if {@code true}, results are fetched as DataFrames, converted
 *            to binary blocks and saved as Hadoop files; otherwise they are
 *            fetched as string RDDs and saved as text files
 */
private void testFrameGeneral(InputInfo iinfo, OutputInfo oinfo, boolean bFromDataFrame, boolean bToDataFrame) throws IOException, DMLException, ParseException {
    // Remember the global settings so the finally block can restore them.
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    RUNTIME_PLATFORM oldRT = DMLScript.rtplatform;
    DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;
    // Index ranges passed to the scripts; both bounds are inclusive, hence the "+ 1" in the sizes.
    int rowstart = 234, rowend = 1478, colstart = 125, colend = 568;
    int bRows = rowend - rowstart + 1, bCols = colend - colstart + 1;
    int rowstartC = 124, rowendC = 1178, colstartC = 143, colendC = 368;
    int cRows = rowendC - rowstartC + 1, cCols = colendC - colstartC + 1;
    // Expected schema and dimensions per output name, filled just before verification.
    HashMap<String, ValueType[]> outputSchema = new HashMap<String, ValueType[]>();
    HashMap<String, MatrixCharacteristics> outputMC = new HashMap<String, MatrixCharacteristics>();
    TestConfiguration config = getTestConfiguration(TEST_NAME);
    loadTestConfiguration(config);
    // Positional script arguments; they are bound to $1..$n in this order further below.
    List<String> proArgs = new ArrayList<String>();
    proArgs.add(input("A"));
    proArgs.add(Integer.toString(rows));
    proArgs.add(Integer.toString(cols));
    proArgs.add(input("B"));
    proArgs.add(Integer.toString(bRows));
    proArgs.add(Integer.toString(bCols));
    proArgs.add(Integer.toString(rowstart));
    proArgs.add(Integer.toString(rowend));
    proArgs.add(Integer.toString(colstart));
    proArgs.add(Integer.toString(colend));
    proArgs.add(output("A"));
    proArgs.add(Integer.toString(rowstartC));
    proArgs.add(Integer.toString(rowendC));
    proArgs.add(Integer.toString(colstartC));
    proArgs.add(Integer.toString(colendC));
    proArgs.add(output("C"));
    fullDMLScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".dml";
    ValueType[] schema = schemaMixedLarge;
    // initialize the frame data.
    List<ValueType> lschema = Arrays.asList(schema);
    fullRScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".R";
    // R command line: input dir, the B index range, expected-output dir, the C index range.
    rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + rowstart + " " + rowend + " " + colstart + " " + colend + " " + expectedDir() + " " + rowstartC + " " + rowendC + " " + colstartC + " " + colendC;
    // Input A: rows x cols random data written with the full schema (fixed seed 1111).
    double sparsity = sparsity1;
    double[][] A = getRandomMatrix(rows, cols, min, max, sparsity, 1111);
    writeInputFrameWithMTD("A", A, true, schema, oinfo);
    // Input B: bRows x bCols random data (fixed seed 2345).
    sparsity = sparsity2;
    double[][] B = getRandomMatrix((int) (bRows), (int) (bCols), min, max, sparsity, 2345);
    // B's schema is the slice of the full schema starting at column colstart (index colstart - 1).
    ValueType[] schemaB = new ValueType[bCols];
    for (int i = 0; i < bCols; ++i) schemaB[i] = schema[colstart - 1 + i];
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    writeInputFrameWithMTD("B", B, true, schemaB, oinfo);
    // C's schema is the slice of the full schema starting at column colstartC; its length equals cCols.
    ValueType[] schemaC = new ValueType[colendC - colstartC + 1];
    for (int i = 0; i < cCols; ++i) schemaC[i] = schema[colstartC - 1 + i];
    Dataset<Row> dfA = null, dfB = null;
    if (bFromDataFrame) {
        // Create DataFrame for input A
        StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schema, false);
        JavaRDD<Row> rowRDDA = FrameRDDConverterUtils.csvToRowRDD(sc, input("A"), DataExpression.DEFAULT_DELIM_DELIMITER, schema);
        dfA = spark.createDataFrame(rowRDDA, dfSchemaA);
        // Create DataFrame for input B
        StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
        JavaRDD<Row> rowRDDB = FrameRDDConverterUtils.csvToRowRDD(sc, input("B"), DataExpression.DEFAULT_DELIM_DELIMITER, schemaB);
        dfB = spark.createDataFrame(rowRDDB, dfSchemaB);
    }
    try {
        Script script = ScriptFactory.dmlFromFile(fullDMLScriptName);
        // Input format follows oinfo, i.e. the format the input frames were written in above.
        String format = "csv";
        if (oinfo == OutputInfo.TextCellOutputInfo)
            format = "text";
        // Bind input A either as a DataFrame or as a text RDD with frame metadata.
        if (bFromDataFrame) {
            script.in("A", dfA);
        } else {
            JavaRDD<String> aIn = sc.textFile(input("A"));
            FrameSchema fs = new FrameSchema(lschema);
            FrameFormat ff = (format.equals("text")) ? FrameFormat.IJV : FrameFormat.CSV;
            FrameMetadata fm = new FrameMetadata(ff, fs, rows, cols);
            script.in("A", aIn, fm);
        }
        // Bind input B the same way, with B's own schema and dimensions.
        if (bFromDataFrame) {
            script.in("B", dfB);
        } else {
            JavaRDD<String> bIn = sc.textFile(input("B"));
            FrameSchema fs = new FrameSchema(lschemaB);
            FrameFormat ff = (format.equals("text")) ? FrameFormat.IJV : FrameFormat.CSV;
            FrameMetadata fm = new FrameMetadata(ff, fs, bRows, bCols);
            script.in("B", bIn, fm);
        }
        // Output one frame to HDFS and get one as RDD //TODO HDFS input/output to do
        script.out("A", "C");
        // set positional argument values
        for (int argNum = 1; argNum <= proArgs.size(); argNum++) {
            script.in("$" + argNum, proArgs.get(argNum - 1));
        }
        MLResults results = ml.execute(script);
        // From here on "format" describes the result format, which follows iinfo rather than oinfo.
        format = "csv";
        if (iinfo == InputInfo.TextCellInputInfo)
            format = "text";
        // Script output "A" is stored under the output name "AB"; remove any previous file first.
        String fName = output("AB");
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
        } catch (IOException e) {
            // NOTE(review): the original IOException is not attached as the cause here.
            throw new DMLRuntimeException("Error: While deleting file on HDFS");
        }
        if (!bToDataFrame) {
            if (format.equals("text")) {
                JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("A");
                javaRDDStringIJV.saveAsTextFile(fName);
            } else {
                JavaRDD<String> javaRDDStringCSV = results.getJavaRDDStringCSV("A");
                javaRDDStringCSV.saveAsTextFile(fName);
            }
        } else {
            Dataset<Row> df = results.getDataFrame("A");
            // Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
            MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, -1, -1, -1);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame).mapToPair(new LongFrameToLongWritableFrameFunction());
            // NOTE(review): output("AB") is presumably the same path as fName (assigned from output("AB") above) - confirm.
            rddOut.saveAsHadoopFile(output("AB"), LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        }
        // Same procedure for script output "C".
        fName = output("C");
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
        } catch (IOException e) {
            // NOTE(review): the original IOException is not attached as the cause here.
            throw new DMLRuntimeException("Error: While deleting file on HDFS");
        }
        if (!bToDataFrame) {
            if (format.equals("text")) {
                JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("C");
                javaRDDStringIJV.saveAsTextFile(fName);
            } else {
                JavaRDD<String> javaRDDStringCSV = results.getJavaRDDStringCSV("C");
                javaRDDStringCSV.saveAsTextFile(fName);
            }
        } else {
            Dataset<Row> df = results.getDataFrame("C");
            // Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
            MatrixCharacteristics mc = new MatrixCharacteristics(cRows, cCols, -1, -1, -1);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fName, LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        }
        // Run the R script (command line prepared in rCmd above).
        runRScript(true);
        // Register the expected schema and dimensions for each output name.
        outputSchema.put("AB", schema);
        outputMC.put("AB", new MatrixCharacteristics(rows, cols, -1, -1));
        outputSchema.put("C", schemaC);
        outputMC.put("C", new MatrixCharacteristics(cRows, cCols, -1, -1));
        // For every configured output file, read the DML result and the R result (<file>.csv) and verify them.
        for (String file : config.getOutputFiles()) {
            MatrixCharacteristics md = outputMC.get(file);
            FrameBlock frameBlock = readDMLFrameFromHDFS(file, iinfo, md);
            FrameBlock frameRBlock = readRFrameFromHDFS(file + ".csv", InputInfo.CSVInputInfo, md);
            ValueType[] schemaOut = outputSchema.get(file);
            verifyFrameData(frameBlock, frameRBlock, schemaOut);
            System.out.println("File " + file + " processed successfully.");
        }
        System.out.println("Frame MLContext test completed successfully.");
    } finally {
        // Restore the global settings changed at the top of the method.
        DMLScript.rtplatform = oldRT;
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    }
}
Also used : FrameFormat(org.apache.sysml.api.mlcontext.FrameFormat) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) MLResults(org.apache.sysml.api.mlcontext.MLResults) TestConfiguration(org.apache.sysml.test.integration.TestConfiguration) ArrayList(java.util.ArrayList) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) Script(org.apache.sysml.api.mlcontext.Script) DMLScript(org.apache.sysml.api.DMLScript) ValueType(org.apache.sysml.parser.Expression.ValueType) FrameSchema(org.apache.sysml.api.mlcontext.FrameSchema) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) Row(org.apache.spark.sql.Row) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata)

Example 88 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class FrameReaderBinaryBlock method readBinaryBlockFrameFromSequenceFile.

/**
 * Reads every frame block of one sequence file and copies it into the
 * target frame at the row position given by the block's key.
 * <p>
 * The key of each record is the 1-based index of the block's first row.
 * Blocks with zero rows or zero columns are skipped. The column metadata of
 * the block that starts at the first row is copied to the target frame.
 *
 * @param path
 *            sequence file (individual part file) to read
 * @param job
 *            job configuration passed to the sequence file reader
 * @param fs
 *            file system passed to the sequence file reader
 * @param dest
 *            pre-allocated target frame; its row and column counts define
 *            the valid range
 * @throws IOException
 *             if reading fails or a non-empty block starts before row 1 or
 *             ends after the last row of {@code dest}
 * @throws DMLRuntimeException
 *             declared by the signature; not thrown directly in this method
 */
@SuppressWarnings({ "deprecation" })
protected static void readBinaryBlockFrameFromSequenceFile(Path path, JobConf job, FileSystem fs, FrameBlock dest) throws IOException, DMLRuntimeException {
    int rlen = dest.getNumRows();
    int clen = dest.getNumColumns();
    // directly read from sequence files (individual partfiles)
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
    LongWritable key = new LongWritable(-1L);
    FrameBlock value = new FrameBlock();
    try {
        while (reader.next(key, value)) {
            // Keep the offset as a long until it is validated: narrowing to
            // int first could wrap a corrupt key into a valid-looking offset.
            long rowOffset = key.get() - 1;
            int rows = value.getNumRows();
            int cols = value.getNumColumns();
            // Empty block, ignore it.
            if (rows == 0 || cols == 0)
                continue;
            // bound check per block: reject blocks that start before the
            // first row (key <= 0) as well as blocks that end past the
            // last row of the target frame
            if (rowOffset < 0 || rowOffset + rows > rlen) {
                throw new IOException("Frame block [" + (rowOffset + 1) + ":" + (rowOffset + rows) + "," + ":" + "] " + "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
            }
            // safe: 0 <= rowOffset and rowOffset + rows <= rlen (an int)
            int row_offset = (int) rowOffset;
            // copy block into target frame, incl meta on first
            dest.copy(row_offset, row_offset + rows - 1, 0, cols - 1, value);
            if (row_offset == 0)
                dest.setColumnMetadata(value.getColumnMetadata());
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
}
Also used : SequenceFile(org.apache.hadoop.io.SequenceFile) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) LongWritable(org.apache.hadoop.io.LongWritable) IOException(java.io.IOException)

Example 89 with LongWritable

use of org.apache.hadoop.io.LongWritable in project systemml by apache.

the class FrameReaderTextCSV method computeCSVSize.

/**
 * Determines the number of rows and columns of a CSV input by scanning all
 * input splits obtained from the given job configuration.
 * <p>
 * The column count is delegated to
 * {@code IOUtilFunctions.countNumColumnsCSV}. The row count is the number
 * of lines over all splits, excluding the header line of the first split
 * (if the properties declare a header) and excluding lines that start with
 * {@code TfUtils.TXMTD_MVPREFIX} or {@code TfUtils.TXMTD_NDPREFIX}.
 *
 * @param path
 *            input path (not read by this method; the splits come from
 *            {@code job})
 * @param job
 *            job configuration used to configure the input format and to
 *            obtain the splits
 * @param fs
 *            file system (not read by this method)
 * @return pair of (number of rows, number of columns)
 * @throws IOException
 *             if obtaining the splits or reading a record fails
 */
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat textFormat = new TextInputFormat();
    textFormat.configure(job);
    InputSplit[] sortedSplits = IOUtilFunctions.sortInputSplits(textFormat.getSplits(job, 1));
    // column count is derived from the splits by the shared helper
    int numCols = IOUtilFunctions.countNumColumnsCSV(sortedSplits, textFormat, job, _props.getDelim());
    // row count: one pass over every split
    int numRows = 0;
    int splitIndex = 0;
    for (InputSplit split : sortedSplits) {
        RecordReader<LongWritable, Text> lineReader = textFormat.getRecordReader(split, job, Reporter.NULL);
        LongWritable offset = new LongWritable();
        Text line = new Text();
        try {
            // only the first split can contain the header line
            boolean skipHeader = (splitIndex == 0) && _props.hasHeader();
            if (skipHeader) {
                lineReader.next(offset, line);
            }
            // every remaining line counts as a row unless it is a meta data line
            while (lineReader.next(offset, line)) {
                String content = line.toString();
                boolean isMetaLine = content.startsWith(TfUtils.TXMTD_MVPREFIX) || content.startsWith(TfUtils.TXMTD_NDPREFIX);
                if (!isMetaLine) {
                    numRows++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(lineReader);
        }
        splitIndex++;
    }
    return new Pair<>(numRows, numCols);
}
Also used : TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) InputSplit(org.apache.hadoop.mapred.InputSplit) Pair(org.apache.sysml.runtime.matrix.data.Pair)

Example 90 with LongWritable

use of org.apache.hadoop.io.LongWritable in project incubator-gobblin by apache.

the class OldApiHadoopFileInputSourceTest method testGetWorkUnitsAndExtractor.

/**
 * Verifies that the file input source produces exactly one work unit and
 * that the extractor created for it returns the expected text as its only
 * record, followed by {@code null}.
 */
@Test
public void testGetWorkUnitsAndExtractor() throws IOException, DataRecordException {
    OldApiHadoopFileInputSource<String, Text, LongWritable, Text> fileInputSource = new TestHadoopFileInputSource();
    List<WorkUnit> workUnitList = fileInputSource.getWorkunits(this.sourceState);
    Assert.assertEquals(workUnitList.size(), 1);
    WorkUnitState workUnitState = new WorkUnitState(workUnitList.get(0));
    Closer closer = Closer.create();
    try {
        // Register the extractor with the closer. Without this,
        // closer.close() in the finally block has nothing to close and the
        // extractor is never released.
        OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text> extractor = closer.register((OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text>) fileInputSource.getExtractor(workUnitState));
        Text text = extractor.readRecord(null);
        Assert.assertEquals(text.toString(), TEXT);
        // a second read must signal that the input is exhausted
        Assert.assertNull(extractor.readRecord(null));
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}
Also used : Closer(com.google.common.io.Closer) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable)445 Text (org.apache.hadoop.io.Text)220 Test (org.junit.Test)171 IntWritable (org.apache.hadoop.io.IntWritable)102 Path (org.apache.hadoop.fs.Path)99 BytesWritable (org.apache.hadoop.io.BytesWritable)70 FloatWritable (org.apache.hadoop.io.FloatWritable)68 Configuration (org.apache.hadoop.conf.Configuration)62 DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable)62 BooleanWritable (org.apache.hadoop.io.BooleanWritable)60 ArrayList (java.util.ArrayList)59 ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)57 ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable)53 IOException (java.io.IOException)49 ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable)48 SequenceFile (org.apache.hadoop.io.SequenceFile)42 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)40 FileSystem (org.apache.hadoop.fs.FileSystem)37 JobConf (org.apache.hadoop.mapred.JobConf)37 DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject)35