Search in sources :

Example 1 with ArrowBasedSinkFunction

Use of edu.iu.dsc.tws.tset.fn.impl.ArrowBasedSinkFunction in the project twister2 by DSC-SPIDAL.

The method execute of the class ArrowTSetSourceExample.

/**
 * Runs the example in two stages.
 *
 * <p>Stage one builds a CSV source from the configured input directory, parses the first
 * field of every record as an {@code Integer}, and sends the values to an
 * {@link ArrowBasedSinkFunction} configured with the Arrow directory, the file name and the
 * JSON form of the schema returned by {@code makeSchema()}. The sink is then executed
 * through {@code env.run}.
 *
 * <p>Stage two creates an Arrow source from that same directory, file name and schema JSON,
 * gathers the elements it yields into a list of integers, and logs the size and contents of
 * each list.
 *
 * @param workerEnv the worker environment used to initialise the batch TSet environment
 */
@Override
public void execute(WorkerEnvironment workerEnv) {
    BatchEnvironment batchEnv = TSetEnvironment.initBatch(workerEnv);
    Config cfg = batchEnv.getConfig();

    // Job parameters, all looked up through DataObjectConstants keys.
    String csvDir = cfg.getStringValue(DataObjectConstants.DINPUT_DIRECTORY);
    String arrowDir = cfg.getStringValue(DataObjectConstants.ARROW_DIRECTORY);
    String arrowFile = cfg.getStringValue(DataObjectConstants.FILE_NAME);
    int workerCount = cfg.getIntegerValue(DataObjectConstants.WORKERS);
    int parallelism = cfg.getIntegerValue(DataObjectConstants.PARALLELISM_VALUE);
    int dataSize = cfg.getIntegerValue(DataObjectConstants.DSIZE);

    // workerCount is only reported in this message; nothing below reads it.
    String summary = "arrow input file:" + arrowFile
        + "\t" + arrowDir
        + "\t" + csvDir
        + "\t" + workerCount
        + "\t" + parallelism;
    LOG.info(summary);

    Schema arrowSchema = makeSchema();

    // Stage one: CSV records -> first field as Integer -> Arrow-based sink.
    MapFunc<String[], Integer> firstFieldAsInt = fields -> Integer.parseInt(fields[0]);
    SourceTSet<String[]> csvRecords =
        batchEnv.createCSVSource(csvDir, dataSize, parallelism, "split");
    SinkTSet<Iterator<Integer>> arrowSink = csvRecords
        .direct()
        .map(firstFieldAsInt)
        .direct()
        .sink(new ArrowBasedSinkFunction<>(arrowDir, arrowFile, arrowSchema.toJson()));
    batchEnv.run(arrowSink);

    // Stage two: read back through the Arrow source and log what each task received.
    ComputeFunc<Iterator<Object>, List<Integer>> collectIntegers = values -> {
        List<Integer> collected = new ArrayList<>();
        while (values.hasNext()) {
            collected.add((Integer) values.next());
        }
        return collected;
    };
    batchEnv.createArrowSource(arrowDir, arrowFile, parallelism, arrowSchema.toJson())
        .direct()
        .compute(collectIntegers)
        .direct()
        .forEach(batch -> LOG.info("Integer Array Size:" + batch.size() + "\tvalues:" + batch));
}
Also used : Twister2Job(edu.iu.dsc.tws.api.Twister2Job) ArrowBasedSinkFunction(edu.iu.dsc.tws.tset.fn.impl.ArrowBasedSinkFunction) Schema(org.apache.arrow.vector.types.pojo.Schema) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) ResourceAllocator(edu.iu.dsc.tws.rsched.core.ResourceAllocator) Options(org.apache.commons.cli.Options) BatchEnvironment(edu.iu.dsc.tws.tset.env.BatchEnvironment) HashMap(java.util.HashMap) Config(edu.iu.dsc.tws.api.config.Config) MapFunc(edu.iu.dsc.tws.api.tset.fn.MapFunc) JobConfig(edu.iu.dsc.tws.api.JobConfig) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) DefaultParser(org.apache.commons.cli.DefaultParser) ImmutableList(com.google.common.collect.ImmutableList) CommandLine(org.apache.commons.cli.CommandLine) Iterator(java.util.Iterator) SourceTSet(edu.iu.dsc.tws.tset.sets.batch.SourceTSet) CommandLineParser(org.apache.commons.cli.CommandLineParser) FieldType(org.apache.arrow.vector.types.pojo.FieldType) SinkTSet(edu.iu.dsc.tws.tset.sets.batch.SinkTSet) Field(org.apache.arrow.vector.types.pojo.Field) Logger(java.util.logging.Logger) Utils(edu.iu.dsc.tws.examples.Utils) DataObjectConstants(edu.iu.dsc.tws.data.utils.DataObjectConstants) Serializable(java.io.Serializable) Twister2Submitter(edu.iu.dsc.tws.rsched.job.Twister2Submitter) List(java.util.List) WorkerEnvironment(edu.iu.dsc.tws.api.resource.WorkerEnvironment) TSetEnvironment(edu.iu.dsc.tws.tset.env.TSetEnvironment) ComputeFunc(edu.iu.dsc.tws.api.tset.fn.ComputeFunc) Twister2Worker(edu.iu.dsc.tws.api.resource.Twister2Worker) BatchEnvironment(edu.iu.dsc.tws.tset.env.BatchEnvironment) Config(edu.iu.dsc.tws.api.config.Config) JobConfig(edu.iu.dsc.tws.api.JobConfig) Schema(org.apache.arrow.vector.types.pojo.Schema) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List)

Aggregations

ImmutableList (com.google.common.collect.ImmutableList)1 JobConfig (edu.iu.dsc.tws.api.JobConfig)1 Twister2Job (edu.iu.dsc.tws.api.Twister2Job)1 Config (edu.iu.dsc.tws.api.config.Config)1 Twister2Worker (edu.iu.dsc.tws.api.resource.Twister2Worker)1 WorkerEnvironment (edu.iu.dsc.tws.api.resource.WorkerEnvironment)1 ComputeFunc (edu.iu.dsc.tws.api.tset.fn.ComputeFunc)1 MapFunc (edu.iu.dsc.tws.api.tset.fn.MapFunc)1 DataObjectConstants (edu.iu.dsc.tws.data.utils.DataObjectConstants)1 Utils (edu.iu.dsc.tws.examples.Utils)1 ResourceAllocator (edu.iu.dsc.tws.rsched.core.ResourceAllocator)1 Twister2Submitter (edu.iu.dsc.tws.rsched.job.Twister2Submitter)1 BatchEnvironment (edu.iu.dsc.tws.tset.env.BatchEnvironment)1 TSetEnvironment (edu.iu.dsc.tws.tset.env.TSetEnvironment)1 ArrowBasedSinkFunction (edu.iu.dsc.tws.tset.fn.impl.ArrowBasedSinkFunction)1 SinkTSet (edu.iu.dsc.tws.tset.sets.batch.SinkTSet)1 SourceTSet (edu.iu.dsc.tws.tset.sets.batch.SourceTSet)1 Serializable (java.io.Serializable)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1