Usage of edu.iu.dsc.tws.task.dataobjects.DataObjectSink in the twister2 project (DSC-SPIDAL).
Example: the execute() method of the TaskWorkerDataLoader class.
@Override
public void execute() {
getParams();
ComputeGraphBuilder computeGraphBuilder = ComputeGraphBuilder.newBuilder(config);
DataObjectSource sourceTask = new DataObjectSource(Context.TWISTER2_DIRECT_EDGE, dataSource);
DataObjectSink sinkTask = new DataObjectSink();
computeGraphBuilder.addSource("datapointsource", sourceTask, parallelism);
ComputeConnection firstGraphComputeConnection = computeGraphBuilder.addCompute("datapointsink", sinkTask, parallelism);
firstGraphComputeConnection.direct("datapointsource").viaEdge(Context.TWISTER2_DIRECT_EDGE).withDataType(MessageTypes.OBJECT);
computeGraphBuilder.setMode(OperationMode.BATCH);
ComputeGraph datapointsTaskGraph = computeGraphBuilder.build();
ExecutionPlan firstGraphExecutionPlan = taskExecutor.plan(datapointsTaskGraph);
taskExecutor.execute(datapointsTaskGraph, firstGraphExecutionPlan);
DataObject<Object> dataPointsObject = taskExecutor.getOutput(datapointsTaskGraph, firstGraphExecutionPlan, "datapointsink");
LOG.info("Total Partitions : " + dataPointsObject.getPartitions().length);
showAllUnits(dataPointsObject);
}
Usage of edu.iu.dsc.tws.task.dataobjects.DataObjectSink in the twister2 project (DSC-SPIDAL).
Example: the executeTestingDataLoadingTaskGraph() method of the SvmSgdAdvancedRunner class.
/**
* This method loads the testing data
* The loaded test data is used to evaluate the trained data
* Testing data is loaded in parallel depending on the parallelism parameter given
* There are partitions created equal to the parallelism
* Later this will be used to do the testing in parallel in the testing task graph
*
* @return twister2 DataObject containing the testing data
*/
public DataObject<Object> executeTestingDataLoadingTaskGraph() {
DataObject<Object> data = null;
final String TEST_DATA_LOAD_EDGE_DIRECT = "direct2";
DataObjectSource sourceTask1 = new DataObjectSource(TEST_DATA_LOAD_EDGE_DIRECT, this.svmJobParameters.getTestingDataDir());
DataObjectSink sinkTask1 = new DataObjectSink();
testingBuilder.addSource(Constants.SimpleGraphConfig.DATA_OBJECT_SOURCE_TESTING, sourceTask1, dataStreamerParallelism);
ComputeConnection firstGraphComputeConnection1 = testingBuilder.addCompute(Constants.SimpleGraphConfig.DATA_OBJECT_SINK_TESTING, sinkTask1, dataStreamerParallelism);
firstGraphComputeConnection1.direct(Constants.SimpleGraphConfig.DATA_OBJECT_SOURCE_TESTING).viaEdge(TEST_DATA_LOAD_EDGE_DIRECT).withDataType(MessageTypes.OBJECT);
testingBuilder.setMode(OperationMode.BATCH);
ComputeGraph datapointsTaskGraph1 = testingBuilder.build();
datapointsTaskGraph1.setGraphName("testing-data-loading-graph");
ExecutionPlan firstGraphExecutionPlan1 = taskExecutor.plan(datapointsTaskGraph1);
taskExecutor.execute(datapointsTaskGraph1, firstGraphExecutionPlan1);
data = taskExecutor.getOutput(datapointsTaskGraph1, firstGraphExecutionPlan1, Constants.SimpleGraphConfig.DATA_OBJECT_SINK_TESTING);
if (data == null) {
throw new NullPointerException("Something Went Wrong in Loading Testing Data");
} else {
LOG.info("Testing Data Total Partitions : " + data.getPartitions().length);
}
return data;
}
Usage of edu.iu.dsc.tws.task.dataobjects.DataObjectSink in the twister2 project (DSC-SPIDAL).
Example: the executeTrainingDataLoadingTaskGraph() method of the SvmSgdAdvancedRunner class.
/**
* This method loads the training data in a distributed mode
* dataStreamerParallelism is the amount of parallelism used
* in loaded the data in parallel.
*
* @return twister2 DataObject containing the training data
*/
public DataObject<Object> executeTrainingDataLoadingTaskGraph() {
DataObject<Object> data = null;
DataObjectSource sourceTask = new DataObjectSource(Context.TWISTER2_DIRECT_EDGE, this.svmJobParameters.getTrainingDataDir());
DataObjectSink sinkTask = new DataObjectSink();
trainingBuilder.addSource(Constants.SimpleGraphConfig.DATA_OBJECT_SOURCE, sourceTask, dataStreamerParallelism);
ComputeConnection firstGraphComputeConnection = trainingBuilder.addCompute(Constants.SimpleGraphConfig.DATA_OBJECT_SINK, sinkTask, dataStreamerParallelism);
firstGraphComputeConnection.direct(Constants.SimpleGraphConfig.DATA_OBJECT_SOURCE).viaEdge(Context.TWISTER2_DIRECT_EDGE).withDataType(MessageTypes.OBJECT);
trainingBuilder.setMode(OperationMode.BATCH);
ComputeGraph datapointsTaskGraph = trainingBuilder.build();
datapointsTaskGraph.setGraphName("training-data-loading-graph");
ExecutionPlan firstGraphExecutionPlan = taskExecutor.plan(datapointsTaskGraph);
taskExecutor.execute(datapointsTaskGraph, firstGraphExecutionPlan);
data = taskExecutor.getOutput(datapointsTaskGraph, firstGraphExecutionPlan, Constants.SimpleGraphConfig.DATA_OBJECT_SINK);
if (data == null) {
throw new NullPointerException("Something Went Wrong in Loading Training Data");
} else {
LOG.info("Training Data Total Partitions : " + data.getPartitions().length);
}
return data;
}
Usage of edu.iu.dsc.tws.task.dataobjects.DataObjectSink in the twister2 project (DSC-SPIDAL).
Example: the executeWeightVectorLoadingTaskGraph() method of the SvmSgdAdvancedRunner class.
/**
* This method loads the training data in a distributed mode
* dataStreamerParallelism is the amount of parallelism used
* in loaded the data in parallel.
*
* @return twister2 DataObject containing the training data
*/
public DataObject<Object> executeWeightVectorLoadingTaskGraph() {
DataObject<Object> data = null;
DataObjectSource sourceTask = new DataObjectSource(Context.TWISTER2_DIRECT_EDGE, this.svmJobParameters.getWeightVectorDataDir());
DataObjectSink sinkTask = new DataObjectSink();
trainingBuilder.addSource(Constants.SimpleGraphConfig.DATA_OBJECT_SOURCE, sourceTask, dataStreamerParallelism);
ComputeConnection firstGraphComputeConnection = trainingBuilder.addCompute(Constants.SimpleGraphConfig.DATA_OBJECT_SINK, sinkTask, dataStreamerParallelism);
firstGraphComputeConnection.direct(Constants.SimpleGraphConfig.DATA_OBJECT_SOURCE).viaEdge(Context.TWISTER2_DIRECT_EDGE).withDataType(MessageTypes.OBJECT);
trainingBuilder.setMode(OperationMode.BATCH);
ComputeGraph datapointsTaskGraph = trainingBuilder.build();
datapointsTaskGraph.setGraphName("weight-vector-loading-graph");
ExecutionPlan firstGraphExecutionPlan = taskExecutor.plan(datapointsTaskGraph);
taskExecutor.execute(datapointsTaskGraph, firstGraphExecutionPlan);
data = taskExecutor.getOutput(datapointsTaskGraph, firstGraphExecutionPlan, Constants.SimpleGraphConfig.DATA_OBJECT_SINK);
if (data == null) {
throw new NullPointerException("Something Went Wrong in Loading Weight Vector");
} else {
LOG.info("Training Data Total Partitions : " + data.getPartitions().length);
}
return data;
}
Usage of edu.iu.dsc.tws.task.dataobjects.DataObjectSink in the twister2 project (DSC-SPIDAL).
Example: the testUniqueSchedules1() method of the KMeansDataGeneratorTest class.
@Test
public void testUniqueSchedules1() throws IOException {
Config config = getConfig();
String dinputDirectory = "/tmp/testdinput";
int numFiles = 1;
int dsize = 20;
int dimension = 2;
int parallelismValue = 2;
KMeansDataGenerator.generateData("txt", new Path(dinputDirectory), numFiles, dsize, 100, dimension, config);
ComputeGraphBuilder computeGraphBuilder = ComputeGraphBuilder.newBuilder(config);
computeGraphBuilder.setTaskGraphName("kmeans");
DataObjectSource sourceTask = new DataObjectSource("direct", dinputDirectory);
DataObjectSink sinkTask = new DataObjectSink();
computeGraphBuilder.addSource("source", sourceTask, parallelismValue);
ComputeConnection computeConnection1 = computeGraphBuilder.addCompute("sink", sinkTask, parallelismValue);
computeConnection1.direct("source").viaEdge("direct").withDataType(MessageTypes.OBJECT);
computeGraphBuilder.setMode(OperationMode.BATCH);
LocalTextInputPartitioner localTextInputPartitioner = new LocalTextInputPartitioner(new Path(dinputDirectory), parallelismValue, config);
DataSource<String, ?> source = new DataSource<>(config, localTextInputPartitioner, parallelismValue);
InputSplit<String> inputSplit;
for (int i = 0; i < parallelismValue; i++) {
inputSplit = source.getNextSplit(i);
Assert.assertNotNull(inputSplit);
}
}
Aggregations