use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
the class TfMetaUtils method convertToTransformMetaDataFrame.
/**
 * Converts transform meta data into an in-memory FrameBlock object.
 * <p>
 * The returned frame uses a pure string schema with one column per entry of
 * {@code colnames}. Recode and bin map entries are written top-down into the
 * rows of their column as {@code key + Lop.DATATYPE_PREFIX + value} strings,
 * and the number of entries is stored as the column's number of distinct values.
 *
 * @param rows number of rows to allocate in each column of the frame
 * @param colnames column names; column IDs are 1-based positions into this array
 * @param rcIDs 1-based IDs of recoded columns
 * @param binIDs 1-based IDs of binned columns
 * @param meta serialized recode/bin maps, keyed by column name; a bin map is split by
 *        {@code TfUtils.TXMTD_SEP} and read as min (field 1), bin width (field 3)
 *        and number of bins (field 4)
 * @param mvmeta impute meta data, keyed by column name; the second
 *        {@code TfUtils.TXMTD_SEP}-separated field is used as the missing-value replacement
 * @return frame block
 * @throws IOException if a recode or bin map is missing for a requested column, if impute
 *         meta data refers to a column that is not in {@code colnames}, or if reading a
 *         recode map fails
 */
private static FrameBlock convertToTransformMetaDataFrame(int rows, String[] colnames, List<Integer> rcIDs, List<Integer> binIDs, HashMap<String, String> meta, HashMap<String, String> mvmeta) throws IOException {
    // create frame block w/ pure string schema
    ValueType[] schema = UtilFunctions.nCopies(colnames.length, ValueType.STRING);
    FrameBlock ret = new FrameBlock(schema, colnames);
    ret.ensureAllocatedColumns(rows);

    // encode recode maps (recoding/dummycoding) into frame
    for (Integer colID : rcIDs) {
        String name = colnames[colID - 1];
        String map = meta.get(name);
        if (map == null)
            throw new IOException("Recode map for column '" + name + "' (id=" + colID + ") not existing.");
        int rpos = 0;
        InputStream is = new ByteArrayInputStream(map.getBytes("UTF-8"));
        // decode with the same charset used for encoding above; relying on the
        // platform default would corrupt non-ASCII recode keys on non-UTF-8 systems
        try (BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
            Pair<String, String> pair = new Pair<>();
            String line;
            while ((line = br.readLine()) != null) {
                DecoderRecode.parseRecodeMapEntry(line, pair);
                String tmp = pair.getKey() + Lop.DATATYPE_PREFIX + pair.getValue();
                ret.set(rpos++, colID - 1, tmp);
            }
        }
        ret.getColumnMetadata(colID - 1).setNumDistinct((long) rpos);
    }

    // encode bin maps (binning) into frame
    for (Integer colID : binIDs) {
        String name = colnames[colID - 1];
        String map = meta.get(name);
        if (map == null)
            throw new IOException("Binning map for column '" + name + "' (id=" + colID + ") not existing.");
        String[] fields = map.split(TfUtils.TXMTD_SEP);
        double min = UtilFunctions.parseToDouble(fields[1]);
        double binwidth = UtilFunctions.parseToDouble(fields[3]);
        int nbins = UtilFunctions.parseToInt(fields[4]);
        // materialize bins to support equi-width/equi-height
        for (int i = 0; i < nbins; i++) {
            String lbound = String.valueOf(min + i * binwidth);
            String ubound = String.valueOf(min + (i + 1) * binwidth);
            ret.set(i, colID - 1, lbound + Lop.DATATYPE_PREFIX + ubound);
        }
        ret.getColumnMetadata(colID - 1).setNumDistinct((long) nbins);
    }

    // encode impute meta data into frame
    for (Entry<String, String> e : mvmeta.entrySet()) {
        int colPos = ArrayUtils.indexOf(colnames, e.getKey());
        // indexOf yields a negative position for unknown names; report that
        // explicitly instead of failing on the column meta data lookup
        if (colPos < 0)
            throw new IOException("Impute meta data refers to unknown column '" + e.getKey() + "'.");
        String mvVal = e.getValue().split(TfUtils.TXMTD_SEP)[1];
        ret.getColumnMetadata(colPos).setMvValue(mvVal);
    }
    return ret;
}
use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
the class DataConverter method convertToFrameBlock.
/**
 * Converts a matrix block into a frame block with the given schema.
 * <p>
 * Sparse inputs are appended row by row; cells without a stored entry are
 * appended as {@code null}. Dense inputs with an all-double schema are copied
 * column-wise without creating cell objects; all other dense inputs are
 * appended row by row with per-cell conversion.
 *
 * @param mb input matrix block
 * @param schema value types of the output frame columns
 * @return frame block holding the matrix data
 */
public static FrameBlock convertToFrameBlock(MatrixBlock mb, ValueType[] schema) {
    FrameBlock frame = new FrameBlock(schema);
    final int numRows = mb.getNumRows();
    final int numCols = mb.getNumColumns();
    // single row buffer reused for every appended row
    Object[] rowBuffer = new Object[numCols];

    // SPARSE
    if (mb.isInSparseFormat()) {
        SparseBlock sparse = mb.getSparseBlock();
        for (int r = 0; r < numRows; r++) {
            // reset
            Arrays.fill(rowBuffer, null);
            boolean hasEntries = (sparse != null) && !sparse.isEmpty(r);
            if (hasEntries) {
                int start = sparse.pos(r);
                int end = start + sparse.size(r);
                int[] colIdx = sparse.indexes(r);
                double[] vals = sparse.values(r);
                for (int k = start; k < end; k++) {
                    int c = colIdx[k];
                    rowBuffer[c] = UtilFunctions.doubleToObject(schema[c], vals[k]);
                }
            }
            frame.appendRow(rowBuffer);
        }
        return frame;
    }

    // DENSE
    int numDoubleCols = UtilFunctions.frequency(schema, ValueType.DOUBLE);
    if (schema.length == 1 && numDoubleCols == 1 && mb.isAllocated()) {
        // special case double schema and single column: the row-major matrix
        // values are handed over as the single frame column without a copy
        frame.reset();
        frame.appendColumns(new double[][] { mb.getDenseBlockValues() });
    } else if (numDoubleCols == schema.length) {
        // special case double schema (without cell-object creation,
        // col pre-allocation, and cache-friendly row-column copy)
        double[] src = mb.getDenseBlockValues();
        double[][] cols = new double[numCols][numRows];
        // tile edge length for the blocked row-to-column copy
        final int tile = 16;
        if (!mb.isEmptyBlock(false)) {
            for (int rowTile = 0; rowTile < numRows; rowTile += tile) {
                for (int colTile = 0; colTile < numCols; colTile += tile) {
                    int rowEnd = Math.min(rowTile + tile, numRows);
                    int colEnd = Math.min(colTile + tile, numCols);
                    for (int r = rowTile; r < rowEnd; r++) {
                        int rowOffset = r * numCols;
                        for (int c = colTile; c < colEnd; c++) {
                            cols[c][r] = src[rowOffset + c];
                        }
                    }
                }
            }
        }
        frame.reset();
        frame.appendColumns(cols);
    } else {
        // general case
        for (int r = 0; r < numRows; r++) {
            for (int c = 0; c < numCols; c++) {
                rowBuffer[c] = UtilFunctions.doubleToObject(schema[c], mb.quickGetValue(r, c));
            }
            frame.appendRow(rowBuffer);
        }
    }
    return frame;
}
use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
the class MLContextConversionUtil method dataFrameToFrameObject.
/**
 * Convert a {@code DataFrame} to a {@code FrameObject}.
 * <p>
 * When no metadata is supplied, a fresh {@code FrameMetadata} is used. The
 * frame schema obtained during conversion and the matrix characteristics are
 * written back into the metadata before the frame object is created.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param frameMetadata
 *            the frame metadata, may be {@code null}
 * @return the {@code DataFrame} converted to a {@code FrameObject}
 * @throws MLContextException
 *             if the conversion raises a {@code DMLRuntimeException}
 */
public static FrameObject dataFrameToFrameObject(String variableName, Dataset<Row> dataFrame, FrameMetadata frameMetadata) {
    try {
        // setup meta data and java spark context
        FrameMetadata metadata = (frameMetadata != null) ? frameMetadata : new FrameMetadata();
        determineFrameFormatIfNeeded(dataFrame, metadata);
        boolean hasIdColumn = isDataFrameWithIDColumn(metadata);

        MatrixCharacteristics fromMetadata = metadata.asMatrixCharacteristics();
        MatrixCharacteristics mc = (fromMetadata != null) ? fromMetadata : new MatrixCharacteristics();

        // convert data frame and obtain column names / schema
        // TODO extend frame schema by column names (right now dropped)
        Pair<String[], ValueType[]> namesAndSchema = new Pair<>();
        JavaPairRDD<Long, FrameBlock> blocks = FrameRDDConverterUtils.dataFrameToBinaryBlock(jsc(), dataFrame, mc, hasIdColumn, namesAndSchema);

        FrameSchema frameSchema = new FrameSchema(Arrays.asList(namesAndSchema.getValue()));
        metadata.setFrameSchema(frameSchema);
        // required due to meta data copy
        metadata.setMatrixCharacteristics(mc);

        return MLContextConversionUtil.binaryBlocksToFrameObject(variableName, blocks, metadata);
    } catch (DMLRuntimeException e) {
        throw new MLContextException("Exception converting DataFrame to FrameObject", e);
    }
}
use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
the class MLContextConversionUtil method javaRDDStringIJVToFrameObject.
/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code FrameObject}.
 * Note that metadata is required for IJV format: the frame schema of the
 * resulting object and the matrix characteristics are both read from it.
 *
 * @param variableName
 *            name of the variable associated with the frame
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata, must not be {@code null}
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 * @throws MLContextException
 *             if {@code frameMetadata} is {@code null} or the conversion to
 *             binary blocks fails
 */
public static FrameObject javaRDDStringIJVToFrameObject(String variableName, JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    // fail fast with a clear message; the schema lookup below would otherwise
    // surface a missing metadata object as a bare NullPointerException
    if (frameMetadata == null)
        throw new MLContextException("Frame metadata is required to convert a JavaRDD<String> in IJV format to a FrameObject");

    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = frameMetadata.asMatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());

    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));

    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        // cells are converted using an all-string schema sized by the column count
        // NOTE(review): the frame object above carries the metadata schema while the
        // blocks are built with a string schema - confirm this is intended
        ValueType[] lschema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
        rdd = FrameRDDConverterUtils.textCellToBinaryBlock(jsc(), javaPairRDDText, mc, lschema);
    } catch (DMLRuntimeException e) {
        // propagate with cause instead of printing the stack trace and returning null
        throw new MLContextException("Exception converting JavaRDD<String> in IJV format to FrameObject", e);
    }
    frameObject.setRDDHandle(new RDDObject(rdd, variableName));
    return frameObject;
}
use of org.apache.sysml.runtime.matrix.data.FrameBlock in project incubator-systemml by apache.
the class TransformReadMetaTest method runTransformReadMetaTest.
/**
 * Writes a generated two-column matrix as CSV, runs the transform test script
 * on it, and checks that the meta data frame written by the script matches the
 * one read via {@code TfMetaUtils.readTransformMetaDataFromFile}.
 * <p>
 * The runtime platform and the local Spark configuration flag are restored
 * once the test has finished, regardless of its outcome.
 *
 * @param rt runtime platform to run the test with
 * @param ofmt output format name; passed to the script and used to select the frame reader
 * @param delim CSV delimiter; {@code ","} selects {@code TEST_NAME1}, any other value {@code TEST_NAME2}
 * @throws IOException wrapping any exception raised while running the test
 * @throws DMLRuntimeException declared in the signature; exceptions raised inside the
 *         test body are rethrown wrapped in an {@code IOException}
 */
private void runTransformReadMetaTest(RUNTIME_PLATFORM rt, String ofmt, String delim) throws IOException, DMLRuntimeException {
    // remember global settings so they can be restored afterwards
    final RUNTIME_PLATFORM savedPlatform = rtplatform;
    final boolean savedSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;

    rtplatform = rt;
    boolean usesSpark = (rtplatform == RUNTIME_PLATFORM.SPARK) || (rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK);
    if (usesSpark) {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    }

    try {
        String testname;
        if (delim.equals(",")) {
            testname = TEST_NAME1;
        } else {
            testname = TEST_NAME2;
        }
        getAndLoadTestConfiguration(testname);

        // generate input data
        MatrixBlock firstCol = MatrixBlock.seqOperations(0.5, rows / 2, 0.5);
        MatrixBlock secondCol = MatrixBlock.seqOperations(0.5, rows / 2, 0.5);
        MatrixBlock combined = firstCol.appendOperations(secondCol, new MatrixBlock());
        double[][] X = DataConverter.convertToDoubleMatrix(combined);
        MatrixBlock mbX = DataConverter.convertToMatrixBlock(X);
        CSVFileFormatProperties csvProps = new CSVFileFormatProperties(false, delim, false);
        MatrixWriter csvWriter = MatrixWriterFactory.createMatrixWriter(OutputInfo.CSVOutputInfo, 1, csvProps);
        csvWriter.writeMatrixToHDFS(mbX, input("X"), rows, 2, -1, -1, -1);

        // read specs transform X and Y
        String specX = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + TEST_DIR + SPEC_X);
        fullDMLScriptName = SCRIPT_DIR + TEST_DIR + testname + ".dml";
        programArgs = new String[] { "-args", input("X"), specX, output("M1"), output("M"), ofmt, delim };

        // run test
        runTest(true, false, null, -1);

        // compare meta data frames
        InputInfo inputInfo = InputInfo.stringExternalToInputInfo(ofmt);
        FrameReader frameReader = FrameReaderFactory.createFrameReader(inputInfo);
        FrameBlock expectedMeta = TfMetaUtils.readTransformMetaDataFromFile(specX, output("M1"), delim);
        FrameBlock actualMeta = frameReader.readFrameFromHDFS(output("M"), rows, 2);
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < 2; c++) {
                String failureMessage = "Wrong result: " + actualMeta.get(r, c) + ".";
                int cmp = UtilFunctions.compareTo(ValueType.STRING, expectedMeta.get(r, c), actualMeta.get(r, c));
                Assert.assertTrue(failureMessage, cmp == 0);
            }
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    } finally {
        rtplatform = savedPlatform;
        DMLScript.USE_LOCAL_SPARK_CONFIG = savedSparkConfig;
    }
}
Aggregations