use of edu.iu.dsc.tws.tset.fn.impl.ArrowBasedSinkFunction in project twister2 by DSC-SPIDAL.
the class ArrowTSetSourceExample method execute.
@Override
public void execute(WorkerEnvironment workerEnv) {
BatchEnvironment env = TSetEnvironment.initBatch(workerEnv);
Config config = env.getConfig();
String csvInputDirectory = config.getStringValue(DataObjectConstants.DINPUT_DIRECTORY);
String arrowInputDirectory = config.getStringValue(DataObjectConstants.ARROW_DIRECTORY);
String arrowFileName = config.getStringValue(DataObjectConstants.FILE_NAME);
int workers = config.getIntegerValue(DataObjectConstants.WORKERS);
int parallel = config.getIntegerValue(DataObjectConstants.PARALLELISM_VALUE);
int dsize = config.getIntegerValue(DataObjectConstants.DSIZE);
LOG.info("arrow input file:" + arrowFileName + "\t" + arrowInputDirectory + "\t" + csvInputDirectory + "\t" + workers + "\t" + parallel);
Schema schema = makeSchema();
SourceTSet<String[]> csvSource = env.createCSVSource(csvInputDirectory, dsize, parallel, "split");
SinkTSet<Iterator<Integer>> sinkTSet = csvSource.direct().map((MapFunc<String[], Integer>) input -> Integer.parseInt(input[0])).direct().sink(new ArrowBasedSinkFunction<>(arrowInputDirectory, arrowFileName, schema.toJson()));
env.run(sinkTSet);
// Source Function Call
env.createArrowSource(arrowInputDirectory, arrowFileName, parallel, schema.toJson()).direct().compute((ComputeFunc<Iterator<Object>, List<Integer>>) input -> {
List<Integer> integers = new ArrayList<>();
input.forEachRemaining(i -> integers.add((Integer) i));
return integers;
}).direct().forEach(s -> LOG.info("Integer Array Size:" + s.size() + "\tvalues:" + s));
}
Aggregations