Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class MLContextConversionUtil, method javaRDDStringIJVToFrameObject.
/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code FrameObject}.
 * Note that metadata is required for IJV format.
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 */
public static FrameObject javaRDDStringIJVToFrameObject(JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ?
        frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(),
        new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo),
        frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        ValueType[] lschema = null;
        if (lschema == null)
            lschema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
        rdd = FrameRDDConverterUtils.textCellToBinaryBlock(jsc(), javaPairRDDText, mc, lschema);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd));
    return frameObject;
}
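A minimal usage sketch (not part of the original snippet): the JavaSparkContext `sc`, the sample IJV cells, and the 2x2 dimensions are assumptions made for illustration. It builds the required metadata with the same FrameFormat/FrameSchema/FrameMetadata API that appears in the FrameTest example further down.

// Hedged usage sketch; `sc` and the 2x2 frame are illustrative assumptions.
List<String> ijvLines = Arrays.asList("1 1 foo", "1 2 3.7", "2 1 bar", "2 2 1.2");
JavaRDD<String> ijvRdd = sc.parallelize(ijvLines);
// IJV input requires metadata: a schema plus the frame dimensions.
FrameSchema schema = new FrameSchema(Arrays.asList(ValueType.STRING, ValueType.DOUBLE));
FrameMetadata meta = new FrameMetadata(FrameFormat.IJV, schema, 2, 2);
FrameObject frame = MLContextConversionUtil.javaRDDStringIJVToFrameObject(ijvRdd, meta);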
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class FrameTest, method testFrameGeneral.
private void testFrameGeneral(InputInfo iinfo, OutputInfo oinfo, boolean bFromDataFrame, boolean bToDataFrame) throws IOException, DMLException, ParseException {
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    RUNTIME_PLATFORM oldRT = DMLScript.rtplatform;
    DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;
    int rowstart = 234, rowend = 1478, colstart = 125, colend = 568;
    int bRows = rowend - rowstart + 1, bCols = colend - colstart + 1;
    int rowstartC = 124, rowendC = 1178, colstartC = 143, colendC = 368;
    int cRows = rowendC - rowstartC + 1, cCols = colendC - colstartC + 1;
    HashMap<String, ValueType[]> outputSchema = new HashMap<String, ValueType[]>();
    HashMap<String, MatrixCharacteristics> outputMC = new HashMap<String, MatrixCharacteristics>();
    TestConfiguration config = getTestConfiguration(TEST_NAME);
    loadTestConfiguration(config);
    List<String> proArgs = new ArrayList<String>();
    proArgs.add(input("A"));
    proArgs.add(Integer.toString(rows));
    proArgs.add(Integer.toString(cols));
    proArgs.add(input("B"));
    proArgs.add(Integer.toString(bRows));
    proArgs.add(Integer.toString(bCols));
    proArgs.add(Integer.toString(rowstart));
    proArgs.add(Integer.toString(rowend));
    proArgs.add(Integer.toString(colstart));
    proArgs.add(Integer.toString(colend));
    proArgs.add(output("A"));
    proArgs.add(Integer.toString(rowstartC));
    proArgs.add(Integer.toString(rowendC));
    proArgs.add(Integer.toString(colstartC));
    proArgs.add(Integer.toString(colendC));
    proArgs.add(output("C"));
    fullDMLScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".dml";
    ValueType[] schema = schemaMixedLarge;
    // initialize the frame data
    List<ValueType> lschema = Arrays.asList(schema);
    fullRScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".R";
    rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + rowstart + " " + rowend + " " + colstart + " " + colend + " " + expectedDir() + " " + rowstartC + " " + rowendC + " " + colstartC + " " + colendC;
    double sparsity = sparsity1;
    double[][] A = getRandomMatrix(rows, cols, min, max, sparsity, 1111);
    writeInputFrameWithMTD("A", A, true, schema, oinfo);
    sparsity = sparsity2;
    double[][] B = getRandomMatrix((int) (bRows), (int) (bCols), min, max, sparsity, 2345);
    ValueType[] schemaB = new ValueType[bCols];
    for (int i = 0; i < bCols; ++i) schemaB[i] = schema[colstart - 1 + i];
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    writeInputFrameWithMTD("B", B, true, schemaB, oinfo);
    ValueType[] schemaC = new ValueType[colendC - colstartC + 1];
    for (int i = 0; i < cCols; ++i) schemaC[i] = schema[colstartC - 1 + i];
    Dataset<Row> dfA = null, dfB = null;
    if (bFromDataFrame) {
        // Create DataFrame for input A
        StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schema, false);
        JavaRDD<Row> rowRDDA = FrameRDDConverterUtils.csvToRowRDD(sc, input("A"), DataExpression.DEFAULT_DELIM_DELIMITER, schema);
        dfA = spark.createDataFrame(rowRDDA, dfSchemaA);
        // Create DataFrame for input B
        StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
        JavaRDD<Row> rowRDDB = FrameRDDConverterUtils.csvToRowRDD(sc, input("B"), DataExpression.DEFAULT_DELIM_DELIMITER, schemaB);
        dfB = spark.createDataFrame(rowRDDB, dfSchemaB);
    }
    try {
        Script script = ScriptFactory.dmlFromFile(fullDMLScriptName);
        String format = "csv";
        if (oinfo == OutputInfo.TextCellOutputInfo)
            format = "text";
        if (bFromDataFrame) {
            script.in("A", dfA);
        } else {
            JavaRDD<String> aIn = sc.textFile(input("A"));
            FrameSchema fs = new FrameSchema(lschema);
            FrameFormat ff = (format.equals("text")) ? FrameFormat.IJV : FrameFormat.CSV;
            FrameMetadata fm = new FrameMetadata(ff, fs, rows, cols);
            script.in("A", aIn, fm);
        }
        if (bFromDataFrame) {
            script.in("B", dfB);
        } else {
            JavaRDD<String> bIn = sc.textFile(input("B"));
            FrameSchema fs = new FrameSchema(lschemaB);
            FrameFormat ff = (format.equals("text")) ? FrameFormat.IJV : FrameFormat.CSV;
            FrameMetadata fm = new FrameMetadata(ff, fs, bRows, bCols);
            script.in("B", bIn, fm);
        }
        // Output one frame to HDFS and get one as RDD // TODO HDFS input/output to do
        script.out("A", "C");
        // set positional argument values
        for (int argNum = 1; argNum <= proArgs.size(); argNum++) {
            script.in("$" + argNum, proArgs.get(argNum - 1));
        }
        MLResults results = ml.execute(script);
        format = "csv";
        if (iinfo == InputInfo.TextCellInputInfo)
            format = "text";
        String fName = output("AB");
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
        } catch (IOException e) {
            throw new DMLRuntimeException("Error: While deleting file on HDFS");
        }
        if (!bToDataFrame) {
            if (format.equals("text")) {
                JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("A");
                javaRDDStringIJV.saveAsTextFile(fName);
            } else {
                JavaRDD<String> javaRDDStringCSV = results.getJavaRDDStringCSV("A");
                javaRDDStringCSV.saveAsTextFile(fName);
            }
        } else {
            Dataset<Row> df = results.getDataFrame("A");
            // Convert the DataFrame back to binary block for comparison (original binary -> DataFrame -> binary)
            MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, -1, -1, -1);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(output("AB"), LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        }
        fName = output("C");
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
        } catch (IOException e) {
            throw new DMLRuntimeException("Error: While deleting file on HDFS");
        }
        if (!bToDataFrame) {
            if (format.equals("text")) {
                JavaRDD<String> javaRDDStringIJV = results.getJavaRDDStringIJV("C");
                javaRDDStringIJV.saveAsTextFile(fName);
            } else {
                JavaRDD<String> javaRDDStringCSV = results.getJavaRDDStringCSV("C");
                javaRDDStringCSV.saveAsTextFile(fName);
            }
        } else {
            Dataset<Row> df = results.getDataFrame("C");
            // Convert the DataFrame back to binary block for comparison (original binary -> DataFrame -> binary)
            MatrixCharacteristics mc = new MatrixCharacteristics(cRows, cCols, -1, -1, -1);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fName, LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        }
        runRScript(true);
        outputSchema.put("AB", schema);
        outputMC.put("AB", new MatrixCharacteristics(rows, cols, -1, -1));
        outputSchema.put("C", schemaC);
        outputMC.put("C", new MatrixCharacteristics(cRows, cCols, -1, -1));
        for (String file : config.getOutputFiles()) {
            MatrixCharacteristics md = outputMC.get(file);
            FrameBlock frameBlock = readDMLFrameFromHDFS(file, iinfo, md);
            FrameBlock frameRBlock = readRFrameFromHDFS(file + ".csv", InputInfo.CSVInputInfo, md);
            ValueType[] schemaOut = outputSchema.get(file);
            verifyFrameData(frameBlock, frameRBlock, schemaOut);
            System.out.println("File " + file + " processed successfully.");
        }
        System.out.println("Frame MLContext test completed successfully.");
    } finally {
        DMLScript.rtplatform = oldRT;
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    }
}
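Distilled from the test above, a hedged sketch of the MLContext round trip it exercises (bind inputs with metadata, declare outputs, execute, fetch results). The script path, input path, dimensions, the MLContext `ml`, the JavaSparkContext `sc`, and the schema list `csvSchema` are illustrative assumptions, not values from the test harness.

// Hedged sketch of the bind-execute-fetch pattern used in testFrameGeneral.
Script script = ScriptFactory.dmlFromFile("scripts/FrameGeneral.dml"); // hypothetical path
JavaRDD<String> aIn = sc.textFile("in/A.csv");                         // hypothetical path
FrameMetadata fm = new FrameMetadata(FrameFormat.CSV, new FrameSchema(csvSchema), 1500, 600); // assumed dimensions
script.in("A", aIn, fm);
script.out("C");
MLResults results = ml.execute(script);
JavaRDD<String> cAsCsv = results.getJavaRDDStringCSV("C");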
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class FrameReaderBinaryBlock, method readBinaryBlockFrameFromSequenceFile.
@SuppressWarnings({ "deprecation" })
protected static void readBinaryBlockFrameFromSequenceFile(Path path, JobConf job, FileSystem fs, FrameBlock dest) throws IOException, DMLRuntimeException {
    int rlen = dest.getNumRows();
    int clen = dest.getNumColumns();
    // directly read from sequence files (individual partfiles)
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, job);
    LongWritable key = new LongWritable(-1L);
    FrameBlock value = new FrameBlock();
    try {
        while (reader.next(key, value)) {
            int row_offset = (int) (key.get() - 1);
            int rows = value.getNumRows();
            int cols = value.getNumColumns();
            // empty block, ignore it
            if (rows == 0 || cols == 0)
                continue;
            // bound check per block
            if (row_offset + rows < 0 || row_offset + rows > rlen) {
                throw new IOException("Frame block [" + (row_offset + 1) + ":" + (row_offset + rows) + ",:] "
                    + "out of overall frame range [1:" + rlen + ",1:" + clen + "].");
            }
            // copy block into target frame, incl meta on first
            dest.copy(row_offset, row_offset + rows - 1, 0, cols - 1, value);
            if (row_offset == 0)
                dest.setColumnMetadata(value.getColumnMetadata());
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
}
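A hedged companion sketch that reuses the same deprecated SequenceFile.Reader API as the method above, only to report which 1-based row ranges the blocks in one part file cover; `fs`, `job`, and `partPath` are assumed to be supplied by the caller.

// Hedged sketch: scan one binary-block part file and print each block's row range.
SequenceFile.Reader reader = new SequenceFile.Reader(fs, partPath, job);
LongWritable key = new LongWritable(-1L);
FrameBlock block = new FrameBlock();
try {
    while (reader.next(key, block)) {
        long firstRow = key.get();                      // keys hold 1-based row offsets
        long lastRow = firstRow + block.getNumRows() - 1;
        System.out.println("block covers rows [" + firstRow + ":" + lastRow + "]");
    }
} finally {
    IOUtilFunctions.closeSilently(reader);
}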
Use of org.apache.hadoop.io.LongWritable in project systemml by apache: the class FrameReaderTextCSV, method computeCSVSize.
protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    splits = IOUtilFunctions.sortInputSplits(splits);
    // compute number of columns
    int ncol = IOUtilFunctions.countNumColumnsCSV(splits, informat, job, _props.getDelim());
    // compute number of rows
    int nrow = 0;
    for (int i = 0; i < splits.length; i++) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[i], job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        try {
            // ignore header of first split
            if (i == 0 && _props.hasHeader())
                reader.next(key, value);
            // count remaining number of rows, ignore meta data
            while (reader.next(key, value)) {
                String val = value.toString();
                nrow += (val.startsWith(TfUtils.TXMTD_MVPREFIX) || val.startsWith(TfUtils.TXMTD_NDPREFIX)) ? 0 : 1;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }
    }
    return new Pair<>(nrow, ncol);
}
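For readers unfamiliar with the deprecated mapred API, here is the core counting loop isolated for a single split, as a hedged sketch: `informat`, `job`, and `split` are assumed to be configured exactly as in the method above, and the split is assumed to start with the header row.

// Hedged sketch of the per-split row count: skip the header, then count every line
// that is not a transform-metadata record.
RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
LongWritable key = new LongWritable();
Text value = new Text();
int nrow = 0;
try {
    reader.next(key, value); // assumed header line
    while (reader.next(key, value)) {
        String line = value.toString();
        if (!line.startsWith(TfUtils.TXMTD_MVPREFIX) && !line.startsWith(TfUtils.TXMTD_NDPREFIX))
            nrow++;
    }
} finally {
    IOUtilFunctions.closeSilently(reader);
}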
Use of org.apache.hadoop.io.LongWritable in project incubator-gobblin by apache: the class OldApiHadoopFileInputSourceTest, method testGetWorkUnitsAndExtractor.
@Test
public void testGetWorkUnitsAndExtractor() throws IOException, DataRecordException {
    OldApiHadoopFileInputSource<String, Text, LongWritable, Text> fileInputSource = new TestHadoopFileInputSource();
    List<WorkUnit> workUnitList = fileInputSource.getWorkunits(this.sourceState);
    Assert.assertEquals(workUnitList.size(), 1);
    WorkUnitState workUnitState = new WorkUnitState(workUnitList.get(0));
    Closer closer = Closer.create();
    try {
        OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text> extractor = (OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text>) fileInputSource.getExtractor(workUnitState);
        Text text = extractor.readRecord(null);
        Assert.assertEquals(text.toString(), TEXT);
        Assert.assertNull(extractor.readRecord(null));
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}
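The test reads exactly one record and then asserts exhaustion. As a hedged generalization, the same extractor could be drained with a loop, since readRecord(null) returns null once the underlying record reader has no more key/value pairs; the `extractor` variable is assumed to be obtained as in the test body above.

// Hedged sketch: drain the extractor until readRecord(null) signals end of input.
Text record;
while ((record = extractor.readRecord(null)) != null) {
    System.out.println(record.toString());
}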