Use of edu.iu.dsc.tws.tset.env.BatchEnvironment in project twister2 by DSC-SPIDAL.
The class FileBasedWordCount, method execute.
@Override
public void execute(WorkerEnvironment workerEnv) {
  BatchEnvironment env = TSetEnvironment.initBatch(workerEnv);
  int sourcePar = (int) env.getConfig().get("PAR");
  // read the file line by line using a single worker
  SourceTSet<String> lines = env.createSource(new WordCountFileSource(), 1);
  // distribute the lines among the workers and perform a flatmap operation to extract words
  ComputeTSet<String> words = lines.partition(new HashingPartitioner<>(), sourcePar)
      .flatmap((FlatMapFunc<String, String>) (l, collector) -> {
        StringTokenizer itr = new StringTokenizer(l);
        while (itr.hasMoreTokens()) {
          collector.collect(itr.nextToken());
        }
      });
  // attach a count of 1 to each word
  KeyedTSet<String, Integer> groupedWords = words.mapToTuple(w -> new Tuple<>(w, 1));
  // perform a reduce-by-key at each worker
  KeyedReduceTLink<String, Integer> keyedReduce = groupedWords.keyedReduce(Integer::sum);
  // gather the results to worker 0 (a dummy map op passes the values to the gather edge)
  // and write them to a file
  keyedReduce.map(i -> i).gather().forEach(new WordcountFileWriter());
}
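The WordCountFileSource and WordcountFileWriter helpers are not part of this listing. Below is a minimal sketch of a line-by-line file source, assuming the TSet SourceFunc<T> interface with hasNext()/next() and a prepare(TSetContext) hook; the input path /tmp/wordcount.in and the field names are illustrative, not taken from twister2, and imports are omitted as elsewhere in this listing.

// Hypothetical sketch, not the actual twister2 WordCountFileSource.
public class WordCountFileSource implements SourceFunc<String> {
  private BufferedReader reader;
  private String nextLine;

  @Override
  public void prepare(TSetContext context) {
    try {
      // assumed input location; the real example reads its own configured file
      reader = new BufferedReader(new FileReader("/tmp/wordcount.in"));
      nextLine = reader.readLine();
    } catch (IOException e) {
      throw new RuntimeException("unable to open the input file", e);
    }
  }

  @Override
  public boolean hasNext() {
    return nextLine != null;
  }

  @Override
  public String next() {
    String line = nextLine;
    try {
      nextLine = reader.readLine();
    } catch (IOException e) {
      throw new RuntimeException("unable to read the input file", e);
    }
    return line;
  }
}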
Use of edu.iu.dsc.tws.tset.env.BatchEnvironment in project twister2 by DSC-SPIDAL.
The class WordCount, method execute.
@Override
public void execute(WorkerEnvironment workerEnv) {
  BatchEnvironment env = TSetEnvironment.initBatch(workerEnv);
  int sourcePar = 4;
  Config config = env.getConfig();
  // create a source with a fixed number of random words
  SourceTSet<String> source = env.createSource(
      new WordGenerator((int) config.get("NO_OF_SAMPLE_WORDS"), (int) config.get("MAX_CHARS")),
      sourcePar).setName("source");
  // map each word to the tuple <word, 1>, where 1 is the count
  KeyedTSet<String, Integer> groupedWords = source.mapToTuple(w -> new Tuple<>(w, 1));
  // reduce by key using the sum operation
  KeyedReduceTLink<String, Integer> keyedReduce = groupedWords.keyedReduce(Integer::sum);
  // print the counts
  keyedReduce.forEach(c -> LOG.info(c.toString()));
}
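The NO_OF_SAMPLE_WORDS and MAX_CHARS values must be placed in the job configuration before the worker runs. The following is a hedged sketch of a driver main method using the usual Twister2Job/JobConfig/Twister2Submitter submission API; the job name, resource sizes, and sample values are illustrative and should be checked against the twister2 version in use.

// Hedged sketch of submitting the WordCount worker with its two config values.
public static void main(String[] args) {
  JobConfig jobConfig = new JobConfig();
  jobConfig.put("NO_OF_SAMPLE_WORDS", 200);  // illustrative values
  jobConfig.put("MAX_CHARS", 5);

  Twister2Job job = Twister2Job.newBuilder()
      .setJobName("wordcount")
      .setWorkerClass(WordCount.class.getName())
      .addComputeResource(1, 512, 4)  // 1 CPU, 512 MB, 4 worker instances
      .setConfig(jobConfig)
      .build();

  Twister2Submitter.submitJob(job, ResourceAllocator.loadConfig(new HashMap<>()));
}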
Use of edu.iu.dsc.tws.tset.env.BatchEnvironment in project twister2 by DSC-SPIDAL.
The class CSVTSetSourceExample, method execute.
@Override
public void execute(WorkerEnvironment workerEnv) {
  BatchEnvironment env = TSetEnvironment.initBatch(workerEnv);
  int dsize = 100;
  int parallelism = 2;
  int dimension = 2;
  SourceTSet<String[]> pointSource = env.createCSVSource("/tmp/dinput", dsize, parallelism, "split");
  ComputeTSet<double[][]> points = pointSource.direct().compute(
      new ComputeFunc<Iterator<String[]>, double[][]>() {
        private double[][] localPoints = new double[dsize / parallelism][dimension];

        @Override
        public double[][] compute(Iterator<String[]> input) {
          for (int i = 0; i < dsize / parallelism && input.hasNext(); i++) {
            String[] value = input.next();
            for (int j = 0; j < value.length; j++) {
              localPoints[i][j] = Double.parseDouble(value[j]);
            }
          }
          LOG.info("Double Array Values:" + Arrays.deepToString(localPoints));
          return localPoints;
        }
      });
}
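In this excerpt nothing downstream consumes points, and TSet graphs are evaluated lazily, so the compute function above would not run until a terminal operation is attached. If the parsed points need to be materialized, a consumer in the same style as the K-Means example below could be added; a one-line sketch, assuming the graph is otherwise unchanged:

points.direct().forEach(p -> LOG.info("parsed points: " + Arrays.deepToString(p)));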
Use of edu.iu.dsc.tws.tset.env.BatchEnvironment in project twister2 by DSC-SPIDAL.
The class KMeansTsetJob, method execute.
@Override
public void execute(WorkerEnvironment workerEnv) {
  BatchEnvironment env = TSetEnvironment.initBatch(workerEnv);
  int workerId = env.getWorkerID();
  LOG.info("TSet worker starting: " + workerId);
  Config config = env.getConfig();
  int parallelism = config.getIntegerValue(DataObjectConstants.PARALLELISM_VALUE);
  int dimension = config.getIntegerValue(DataObjectConstants.DIMENSIONS);
  int numFiles = config.getIntegerValue(DataObjectConstants.NUMBER_OF_FILES);
  int dsize = config.getIntegerValue(DataObjectConstants.DSIZE);
  int csize = config.getIntegerValue(DataObjectConstants.CSIZE);
  int iterations = config.getIntegerValue(DataObjectConstants.ARGS_ITERATIONS);
  String dataDirectory = config.getStringValue(DataObjectConstants.DINPUT_DIRECTORY) + workerId;
  String centroidDirectory = config.getStringValue(DataObjectConstants.CINPUT_DIRECTORY) + workerId;
  String type = config.getStringValue(DataObjectConstants.FILE_TYPE);
  KMeansUtils.generateDataPoints(env.getConfig(), dimension, numFiles, dsize, csize,
      dataDirectory, centroidDirectory, type);
  long startTime = System.currentTimeMillis();

  /*CachedTSet<double[][]> points =
      tc.createSource(new PointsSource(type), parallelismValue).setName("dataSource").cache();*/

  // read the data points: each worker parses its own split into a double[][] block
  SourceTSet<String[]> pointSource = env.createCSVSource(dataDirectory, dsize, parallelism, "split");
  ComputeTSet<double[][]> points = pointSource.direct().compute(
      new ComputeFunc<Iterator<String[]>, double[][]>() {
        private double[][] localPoints = new double[dsize / parallelism][dimension];

        @Override
        public double[][] compute(Iterator<String[]> input) {
          for (int i = 0; i < dsize / parallelism && input.hasNext(); i++) {
            String[] value = input.next();
            for (int j = 0; j < value.length; j++) {
              localPoints[i][j] = Double.parseDouble(value[j]);
            }
          }
          return localPoints;
        }
      });
  points.setName("dataSource").cache();

  // CachedTSet<double[][]> centers = tc.createSource(new CenterSource(type), parallelism).cache();

  // read the initial centroids: every worker reads the complete centroid file
  SourceTSet<String[]> centerSource = env.createCSVSource(centroidDirectory, csize, parallelism, "complete");
  ComputeTSet<double[][]> centers = centerSource.direct().compute(
      new ComputeFunc<Iterator<String[]>, double[][]>() {
        private double[][] localCenters = new double[csize][dimension];

        @Override
        public double[][] compute(Iterator<String[]> input) {
          for (int i = 0; i < csize && input.hasNext(); i++) {
            String[] value = input.next();
            for (int j = 0; j < dimension; j++) {
              localCenters[i][j] = Double.parseDouble(value[j]);
            }
          }
          return localCenters;
        }
      });
  CachedTSet<double[][]> cachedCenters = centers.cache();
  long endTimeData = System.currentTimeMillis();

  // one k-means pass: assign points to centers locally, then all-reduce the sums and average them
  ComputeTSet<double[][]> kmeansTSet = points.direct().map(new KMeansMap());
  ComputeTSet<double[][]> reduced = kmeansTSet.allReduce((ReduceFunc<double[][]>) (t1, t2) -> {
    double[][] newCentroids = new double[t1.length][t1[0].length];
    for (int j = 0; j < t1.length; j++) {
      for (int k = 0; k < t1[0].length; k++) {
        newCentroids[j][k] = t1[j][k] + t2[j][k];
      }
    }
    return newCentroids;
  }).map(new AverageCenters());
  kmeansTSet.addInput("centers", cachedCenters);
  CachedTSet<double[][]> cached = reduced.lazyCache();

  // iterate: evaluate the lazy graph and feed the new centers back in
  for (int i = 0; i < iterations; i++) {
    env.evalAndUpdate(cached, cachedCenters);
  }
  env.finishEval(cached);
  long endTime = System.currentTimeMillis();

  if (workerId == 0) {
    LOG.info("Data Load time : " + (endTimeData - startTime) + "\n"
        + "Total Time : " + (endTime - startTime) + "\n"
        + "Compute Time : " + (endTime - endTimeData));
    LOG.info("Final Centroids After\t" + iterations + "\titerations\t");
    cachedCenters.direct().forEach(i -> LOG.info(Arrays.deepToString(i)));
  }
}
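The iteration above relies on the lazy-evaluation API: reduced.lazyCache() builds the graph without running it, env.evalAndUpdate(cached, cachedCenters) evaluates it once per loop pass and writes the result back into cachedCenters so the next pass of KMeansMap sees the updated centers through its "centers" input, and env.finishEval(cached) closes the lazy execution. Stripped down to its skeleton, the pattern looks like the sketch below, where data and initialState stand in for TSets built as points and centers are above, and IterationStep is a placeholder map function, not a twister2 class.

// Skeleton of the lazy iteration pattern used above (names marked as placeholders are illustrative).
CachedTSet<double[][]> state = initialState.cache();        // e.g. the initial centers, read once
ComputeTSet<double[][]> step = data.direct().map(new IterationStep());
step.addInput("state", state);                              // make the cached state visible to the map function
CachedTSet<double[][]> next = step.lazyCache();             // build the iteration graph lazily
for (int i = 0; i < iterations; i++) {
  env.evalAndUpdate(next, state);                           // run one pass and overwrite "state" with the result
}
env.finishEval(next);                                       // close the lazy evaluation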