use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class Connection method readDoubleMatrix.
// //////////////////////////////////////////
// Read matrices
// //////////////////////////////////////////
/**
 * Loads a matrix from HDFS into a dense two-dimensional double array. Dimensions,
 * block sizes, number of non-zeros and the file format are taken from the JSON
 * metadata file that belongs to the given matrix file.
 * NOTE: this call currently only supports default configurations for CSV.
 *
 * @param fname the filename of the input matrix
 * @return matrix as a two-dimensional double array
 * @throws IOException if reading the metadata or the matrix fails; any exception
 *         raised along the way is wrapped as the cause
 */
public double[][] readDoubleMatrix(String fname) throws IOException {
	try {
		// locate and load the json metadata that accompanies the matrix file
		String mtdPath = DataExpression.getMTDFileName(fname);
		JSONObject meta = new DataExpression().readMetadataFile(mtdPath, false);

		// mandatory dimensions
		long numRows = meta.getLong(DataExpression.READROWPARAM);
		long numCols = meta.getLong(DataExpression.READCOLPARAM);

		// optional entries fall back to -1 when absent from the metadata
		int rowBlockSize = -1;
		if (meta.containsKey(DataExpression.ROWBLOCKCOUNTPARAM)) {
			rowBlockSize = meta.getInt(DataExpression.ROWBLOCKCOUNTPARAM);
		}
		int colBlockSize = -1;
		if (meta.containsKey(DataExpression.COLUMNBLOCKCOUNTPARAM)) {
			colBlockSize = meta.getInt(DataExpression.COLUMNBLOCKCOUNTPARAM);
		}
		long nonZeros = -1;
		if (meta.containsKey(DataExpression.READNUMNONZEROPARAM)) {
			nonZeros = meta.getLong(DataExpression.READNUMNONZEROPARAM);
		}

		// translate the external format name into the internal input info
		String formatName = meta.getString(DataExpression.FORMAT_TYPE);
		InputInfo inputInfo = InputInfo.stringExternalToInputInfo(formatName);

		// delegate the actual read to the fully parameterized overload
		return readDoubleMatrix(fname, inputInfo, numRows, numCols, rowBlockSize, colBlockSize, nonZeros);
	} catch (Exception ex) {
		// uniform error contract: everything surfaces as IOException
		throw new IOException(ex);
	}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class Connection method convertToMatrix.
/**
 * Parses a textual matrix from an input stream into a matrix block.
 * Accepted formats are csv, text (textcell) and mm (matrix market); any other
 * format value is rejected before the stream is touched.
 *
 * @param input InputStream to a string matrix in csv, text or mm format
 * @param rows number of rows in the matrix
 * @param cols number of columns in the matrix
 * @param format input format of the given stream
 * @return matrix as a matrix block
 * @throws IOException if the format is not supported or the read fails with a
 *         DMLRuntimeException
 */
public MatrixBlock convertToMatrix(InputStream input, int rows, int cols, String format) throws IOException {
	// sanity check input format
	boolean isCsv = DataExpression.FORMAT_TYPE_VALUE_CSV.equals(format);
	boolean isText = DataExpression.FORMAT_TYPE_VALUE_TEXT.equals(format);
	boolean isMatrixMarket = DataExpression.FORMAT_TYPE_VALUE_MATRIXMARKET.equals(format);
	if (!isText && !isMatrixMarket && !isCsv) {
		throw new IOException("Invalid input format (expected: csv, text or mm): " + format);
	}

	setLocalConfigs();

	try {
		// csv has its own reader; everything else goes through the text cell reader
		// NOTE(review): mm input is read with TextCellInputInfo as well - confirm the
		// reader created for it copes with matrix market input as intended
		InputInfo readerInfo;
		if (isCsv) {
			readerInfo = InputInfo.CSVInputInfo;
		} else {
			readerInfo = InputInfo.TextCellInputInfo;
		}
		MatrixReader matrixReader = MatrixReaderFactory.createMatrixReader(readerInfo);

		// square blocks of the configured size; the nnz estimate is the full cell count
		// (computed in long to avoid int overflow)
		int blockSize = ConfigurationManager.getBlocksize();
		long estimatedNnz = (long) rows * cols;
		return matrixReader.readMatrixFromInputStream(input, rows, cols, blockSize, blockSize, estimatedNnz);
	} catch (DMLRuntimeException rex) {
		throw new IOException(rex);
	}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class Connection method readStringFrame.
// //////////////////////////////////////////
// Read frames
// //////////////////////////////////////////
/**
 * Loads a frame from HDFS into a dense two-dimensional string array. The frame
 * dimensions and file format are taken from the JSON metadata file that belongs
 * to the given frame file.
 * NOTE: this call currently only supports default configurations for CSV.
 *
 * @param fname the filename of the input frame
 * @return frame as a two-dimensional string array
 * @throws IOException if reading the metadata or the frame fails; any exception
 *         raised along the way is wrapped as the cause
 */
public String[][] readStringFrame(String fname) throws IOException {
	try {
		// locate and load the json metadata that accompanies the frame file
		String mtdPath = DataExpression.getMTDFileName(fname);
		JSONObject meta = new DataExpression().readMetadataFile(mtdPath, false);

		// dimensions and format are all mandatory metadata entries here
		long numRows = meta.getLong(DataExpression.READROWPARAM);
		long numCols = meta.getLong(DataExpression.READCOLPARAM);
		String formatName = meta.getString(DataExpression.FORMAT_TYPE);
		InputInfo inputInfo = InputInfo.stringExternalToInputInfo(formatName);

		// delegate the actual read to the fully parameterized overload
		return readStringFrame(fname, inputInfo, numRows, numCols);
	} catch (Exception ex) {
		// uniform error contract: everything surfaces as IOException
		throw new IOException(ex);
	}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class ReblockSPInstruction method processInstruction.
/**
 * Executes the reblock of input1 into output. The output characteristics are
 * set from the input dimensions and non-zeros together with this instruction's
 * block sizes, the source format is taken from the input's metadata, and the
 * reblock then runs either in memory (if the recompiler check allows it) or
 * via the matrix/frame reblock of this instruction. Inputs that are neither
 * matrix nor frame are left untouched.
 *
 * @param ec execution context; must be a SparkExecutionContext
 * @throws DMLRuntimeException if the input carries no format metadata
 */
@Override
public void processInstruction(ExecutionContext ec) {
	final SparkExecutionContext sec = (SparkExecutionContext) ec;
	final String inName = input1.getName();

	// set the output characteristics
	CacheableData<?> inData = sec.getCacheableData(inName);
	MatrixCharacteristics inMc = sec.getMatrixCharacteristics(inName);
	final String outName = output.getName();
	MatrixCharacteristics outMc = sec.getMatrixCharacteristics(outName);
	outMc.set(inMc.getRows(), inMc.getCols(), brlen, bclen, inMc.getNonZeros());

	// get the source format from the meta data
	MetaDataFormat formatMeta = (MetaDataFormat) inData.getMetaData();
	if (formatMeta == null) {
		throw new DMLRuntimeException("Error: Metadata not found");
	}
	InputInfo iinfo = formatMeta.getInputInfo();

	// check for in-memory reblock (w/ lazy spark context, potential for latency reduction)
	final boolean inMemory = Recompiler.checkCPReblock(sec, inName);

	// execute matrix/frame reblock, in memory if possible
	if (input1.getDataType() == DataType.MATRIX) {
		if (inMemory) {
			Recompiler.executeInMemoryMatrixReblock(sec, inName, outName);
		} else {
			processMatrixReblockInstruction(sec, iinfo);
		}
	} else if (input1.getDataType() == DataType.FRAME) {
		if (inMemory) {
			Recompiler.executeInMemoryFrameReblock(sec, inName, outName);
		} else {
			processFrameReblockInstruction(sec, iinfo);
		}
	}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project incubator-systemml by apache.
the class RemoteDPParForSpark method getPartitionedInput.
/**
 * Builds the partitioned input rdd for the given matrix variable. Three cases
 * are distinguished: a dataset-backed input (rows are converted directly),
 * a binary block rdd that needs no grouping, and a binary block rdd that
 * needs grouping. Both binary block cases run the same remote partitioning
 * mapper; they differ only in which rdd is used as its input.
 *
 * @param sec spark execution context holding the matrix variable
 * @param matrixvar name of the matrix variable to partition
 * @param oi output info handed to the partitioning mapper
 * @param dpf partition format (scheme and partition size)
 * @return pair rdd produced by the row conversion or the partitioning mapper
 */
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
	MatrixObject mo = sec.getMatrixObject(matrixvar);
	MatrixCharacteristics mc = mo.getMatrixCharacteristics();

	// case 1: dataset input
	// NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
	if (hasInputDataSet(dpf, mo)) {
		DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
		Dataset<Row> df = dsObj.getDataset();

		// construct or reuse row ids
		JavaPairRDD<Row, Long> rowsWithIds;
		if (dsObj.containsID()) {
			rowsWithIds = df.javaRDD().mapToPair(new DataFrameExtractIDFunction(df.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN)));
		} else {
			// zip row index
			rowsWithIds = df.javaRDD().zipWithIndex();
		}

		// convert row to row in matrix block format
		return rowsWithIds.mapToPair(new DataFrameToRowBinaryBlockFunction(mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
	}

	// cases 2 and 3: binary block input rdd, without or with grouping
	boolean withGrouping = requiresGrouping(dpf, mo);
	JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sec.getBinaryBlockRDDHandleForVariable(matrixvar);

	// with grouping only: avoid unnecessary caching if input is checkpoint and not
	// cached yet, to reduce memory pressure for shuffle and subsequent operations
	if (withGrouping && mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(blocks.id())) {
		blocks = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
	}

	// data partitioning of input rdd
	InputInfo ii = InputInfo.BinaryBlockInputInfo;
	DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
	return blocks.flatMapToPair(dpfun);
}
Aggregations