Search in sources :

Example 6 with TSetContext

use of edu.iu.dsc.tws.api.tset.TSetContext in project twister2 by DSC-SPIDAL.

The method execute of the class TSetTeraSort.

/**
 * Builds and runs a TeraSort-style batch job on the TSet API.
 *
 * <p>The job has three stages:
 * <ol>
 *   <li>a keyed source that produces random 10-byte keys, each paired with a 90-byte value;</li>
 *   <li>a keyed gather that routes each key by its two leading bytes and orders keys with
 *       {@code ByteArrayComparator}, with {@code useDisk()} enabled on the link;</li>
 *   <li>a verification step that logs a warning whenever a key arrives that compares lower
 *       than the key seen immediately before it.</li>
 * </ol>
 *
 * @param workerEnv the worker environment used to initialise the batch environment
 */
@Override
public void execute(WorkerEnvironment workerEnv) {
    BatchEnvironment env = TSetEnvironment.initBatch(workerEnv);
    final int parallelism = env.getConfig().getIntegerValue(PARAM_PARALLELISM);
    final int dataSize = env.getConfig().getIntegerValue(PARAM_DATA_SIZE_GB);
    KeyedSourceTSet<byte[], byte[]> keyedSource = env.createKeyedSource(new SourceFunc<Tuple<byte[], byte[]>>() {

        // keys generated up front in prepare() and drained one at a time by next()
        private Queue<byte[]> keys = new LinkedList<>();

        // single 90-byte value buffer; the same array instance is handed out with every tuple
        private byte[] data = new byte[90];

        @Override
        public void prepare(TSetContext context) {
            Arrays.fill(data, (byte) 1);
            Random random = new Random();
            // One tuple is 100 bytes (10-byte key + 90-byte value). The size is computed in
            // double arithmetic from the very first factor: the previous int-only product
            // dataSize * 1024 * 1024 * 1024 overflowed for any dataSize >= 2.
            int noOfTuples = (int) ((dataSize * 1024.0d * 1024 * 1024) / parallelism / 100);
            if (context.getIndex() == 0) {
                LOG.info(noOfTuples + " tuples will be produced in each source");
            }
            for (int i = 0; i < noOfTuples; i++) {
                byte[] key = new byte[10];
                random.nextBytes(key);
                keys.add(key);
            }
        }

        @Override
        public boolean hasNext() {
            return !keys.isEmpty();
        }

        @Override
        public Tuple<byte[], byte[]> next() {
            return new Tuple<>(keys.poll(), data);
        }
    }, parallelism);
    keyedSource.keyedGather(new PartitionFunc<byte[]>() {

        // width of the key range (over the two leading bytes) assigned to one destination
        protected int keysToOneTask;

        // destination ids in ascending order, so that lower key ranges map to lower ids
        protected int[] destinationsList;

        @Override
        public void prepare(Set<Integer> sources, Set<Integer> destinations) {
            // considering only most significant bytes of array
            int totalPossibilities = 256 * 256;
            // ceil guarantees key / keysToOneTask is always < destinations.size()
            this.keysToOneTask = (int) Math.ceil(totalPossibilities / (double) destinations.size());
            this.destinationsList = new int[destinations.size()];
            int index = 0;
            for (int i : destinations) {
                destinationsList[index++] = i;
            }
            Arrays.sort(this.destinationsList);
        }

        // Maps the two leading key bytes, read as an unsigned 16-bit value, to a range index.
        int getIndex(byte[] array) {
            int key = ((array[0] & 0xff) << 8) + (array[1] & 0xff);
            return key / keysToOneTask;
        }

        @Override
        public int partition(int sourceIndex, byte[] val) {
            return this.destinationsList[this.getIndex(val)];
        }

        @Override
        public void commit(int source, int partition) {
            // nothing to commit: the partitioner keeps no per-message state
        }
    }, (left, right) -> ByteArrayComparator.getInstance().compare(left, right)).useDisk().forEach(new ApplyFunc<Tuple<byte[], Iterator<byte[]>>>() {

        // key of the previously received group; null until the first group arrives
        private byte[] previousKey;

        @Override
        public void apply(Tuple<byte[], Iterator<byte[]>> data) {
            if (previousKey != null) {
                int compare = ByteArrayComparator.getInstance().compare(previousKey, data.getKey());
                // a positive result means the previous key sorts after the current one
                if (compare > 0) {
                    LOG.warning("Unsorted keys detected. TeraSort has failed. " + compare);
                }
            }
            previousKey = data.getKey();
        }
    });
}
Also used : BatchEnvironment(edu.iu.dsc.tws.tset.env.BatchEnvironment) LinkedList(java.util.LinkedList) TSetContext(edu.iu.dsc.tws.api.tset.TSetContext) Random(java.util.Random) Iterator(java.util.Iterator) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple)

Example 7 with TSetContext

use of edu.iu.dsc.tws.api.tset.TSetContext in project twister2 by DSC-SPIDAL.

The method execute of the class SetSchemaExample.

/**
 * Demonstrates how input/output schemas are reported for TSets, with and without an
 * explicit {@code withSchema(...)} call, across a source, a map and a keyed map.
 *
 * <p>Each user function logs the schemas visible through its {@code TSetContext} when it is
 * prepared, and each pipeline logs the values it receives under its own {@code outN} tag.
 *
 * @param workerEnv the worker environment used to initialise the batch environment
 */
@Override
public void execute(WorkerEnvironment workerEnv) {
    BatchEnvironment batchEnv = TSetEnvironment.initBatch(workerEnv);

    // Source with parallelism 2; each instance emits exactly one value (1) and then stops.
    SourceTSet<Integer> source = batchEnv.createSource(new BaseSourceFunc<Integer>() {

        private int emitted = 0;

        @Override
        public void prepare(TSetContext context) {
            super.prepare(context);
            String schemas = "schemas0: " + context.getInputSchema() + " -> " + context.getOutputSchema();
            LOG.info(schemas);
        }

        @Override
        public boolean hasNext() {
            return emitted == 0;
        }

        @Override
        public Integer next() {
            emitted += 1;
            return emitted;
        }
    }, 2).setName("src");

    // Same source consumed twice: first as-is, then with an explicit integer schema.
    source.direct().forEach(value -> LOG.info("out0: " + value));
    source.withSchema(PrimitiveSchemas.INTEGER).direct().forEach(value -> LOG.info("out1: " + value));

    // Sum the source values with an all-reduce and render the result as a string.
    ComputeTSet<String> asText = source.allReduce(Integer::sum).map(new BaseMapFunc<Integer, String>() {

        @Override
        public void prepare(TSetContext context) {
            super.prepare(context);
            String schemas = "schemas1: " + context.getInputSchema() + " -> " + context.getOutputSchema();
            LOG.info(schemas);
        }

        @Override
        public String map(Integer input) {
            return input.toString();
        }
    });
    asText.direct().forEach(value -> LOG.info("out2: " + value));
    asText.withSchema(PrimitiveSchemas.STRING).direct().forEach(value -> LOG.info("out3: " + value));

    // Turn each string into a (string, parsed int) tuple to obtain a keyed TSet.
    KeyedTSet<String, Integer> byText = asText.mapToTuple(new BaseMapFunc<String, Tuple<String, Integer>>() {

        @Override
        public void prepare(TSetContext context) {
            super.prepare(context);
            String schemas = "schemas2: " + context.getInputSchema() + " -> " + context.getOutputSchema();
            LOG.info(schemas);
        }

        @Override
        public Tuple<String, Integer> map(String input) {
            return new Tuple<>(input, Integer.parseInt(input));
        }
    });
    byText.keyedDirect().forEach(entry -> LOG.info("out4: " + entry));
    byText.withSchema(new KeyedSchema(MessageTypes.STRING, MessageTypes.INTEGER)).keyedDirect().forEach(entry -> LOG.info("out5: " + entry));
}
Also used : KeyedSchema(edu.iu.dsc.tws.api.tset.schema.KeyedSchema) BatchEnvironment(edu.iu.dsc.tws.tset.env.BatchEnvironment) TSetContext(edu.iu.dsc.tws.api.tset.TSetContext) BaseSourceFunc(edu.iu.dsc.tws.api.tset.fn.BaseSourceFunc) Tuple(edu.iu.dsc.tws.api.comms.structs.Tuple)

Example 8 with TSetContext

use of edu.iu.dsc.tws.api.tset.TSetContext in project twister2 by DSC-SPIDAL.

The method execute of the class CheckpointingExample.

/**
 * Persists one dummy source and feeds the persisted data into a compute over a second
 * dummy source as a named input ("in").
 *
 * <p>The compute function pairs the values from its own iterator with the values read from
 * the persisted input, stopping as soon as either side runs out, and the resulting string is
 * logged. The persisted TSet is also logged directly, element by element.
 *
 * @param workerEnvironment the worker environment used to initialise checkpointing
 */
@Override
public void execute(WorkerEnvironment workerEnvironment) {
    BatchChkPntEnvironment chkEnv = TSetEnvironment.initCheckpointing(workerEnvironment);
    int n = 5;

    // First source is persisted so it can be re-used as an input below.
    SourceTSet<Integer> firstSource = dummySource(chkEnv, n, 100 * chkEnv.getWorkerID());
    PersistedTSet<Integer> persisted = firstSource.direct().persist();

    // Second source uses the same worker-dependent third argument, shifted by 10.
    SourceTSet<Integer> secondSource = dummySource(chkEnv, n, 100 * chkEnv.getWorkerID() + 10);
    secondSource.direct().compute(new BaseComputeFunc<Iterator<Integer>, String>() {

        // consumer over the input registered under the name "in"
        private DataPartitionConsumer<Integer> persistedValues;

        @Override
        public void prepare(TSetContext ctx) {
            super.prepare(ctx);
            persistedValues = (DataPartitionConsumer<Integer>) ctx.getInput("in").getConsumer();
        }

        @Override
        public String compute(Iterator<Integer> input) {
            StringBuilder pairs = new StringBuilder();
            // advance both sides in lock-step; stop when either one is exhausted
            while (input.hasNext() && persistedValues.hasNext()) {
                Integer fromSource = input.next();
                Integer fromPersisted = persistedValues.next();
                pairs.append("(");
                pairs.append(fromSource);
                pairs.append(",");
                pairs.append(fromPersisted);
                pairs.append(") ");
            }
            return pairs.toString();
        }
    }).addInput("in", persisted).direct().forEach(line -> LOG.info(line));
    persisted.direct().forEach(value -> LOG.info(value.toString()));
}
Also used : TSetContext(edu.iu.dsc.tws.api.tset.TSetContext) BatchChkPntEnvironment(edu.iu.dsc.tws.tset.env.BatchChkPntEnvironment) BaseComputeFunc(edu.iu.dsc.tws.api.tset.fn.BaseComputeFunc) Iterator(java.util.Iterator) DataPartitionConsumer(edu.iu.dsc.tws.api.dataset.DataPartitionConsumer)

Example 9 with TSetContext

use of edu.iu.dsc.tws.api.tset.TSetContext in project twister2 by DSC-SPIDAL.

The method execute of the class WordCount.

/**
 * Builds and runs a streaming word count.
 *
 * <p>A source with parallelism 4 endlessly emits words picked at random from a fixed pool
 * of random strings. The words are partitioned with a {@code HashingPartitioner} and handed
 * to a sink that keeps a running count per word and logs every update.
 *
 * @param workerEnvironment the worker environment used to initialise the streaming environment
 */
@Override
public void execute(WorkerEnvironment workerEnvironment) {
    StreamingEnvironment streamEnv = TSetEnvironment.initStreaming(workerEnvironment);

    // source -> hash partition -> counting sink
    streamEnv.createSource(new SourceFunc<String>() {

        // pool of words the source chooses from
        private List<String> wordPool = new ArrayList<>();

        // random generator used both to build the pool and to pick words from it
        private Random rng;

        @Override
        public void prepare(TSetContext context) {
            this.rng = new Random();
            RandomString generator = new RandomString(MAX_CHARS, rng, RandomString.ALPHANUM);
            for (int produced = 0; produced < NO_OF_SAMPLE_WORDS; produced++) {
                wordPool.add(generator.nextRandomSizeString());
            }
        }

        @Override
        public boolean hasNext() {
            // unbounded stream: there is always another word
            return true;
        }

        @Override
        public String next() {
            int pick = rng.nextInt(wordPool.size());
            return wordPool.get(pick);
        }
    }, 4).partition(new HashingPartitioner<>()).sink(new SinkFunc<String>() {

        // running number of occurrences seen so far, per word
        private Map<String, Integer> counts = new HashMap<>();

        // kept so that add() can report the index of this sink instance
        private TSetContext sinkContext;

        @Override
        public void prepare(TSetContext context) {
            this.sinkContext = context;
        }

        @Override
        public boolean add(String word) {
            // inserts 1 for a new word, otherwise stores and returns the old count plus one
            int count = counts.merge(word, 1, Integer::sum);
            LOG.log(Level.INFO, String.format("%d Word %s count %s", sinkContext.getIndex(), word, count));
            return true;
        }
    });

    // start executing the streaming graph
    streamEnv.run();
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) RandomString(edu.iu.dsc.tws.examples.utils.RandomString) RandomString(edu.iu.dsc.tws.examples.utils.RandomString) StreamingEnvironment(edu.iu.dsc.tws.tset.env.StreamingEnvironment) TSetContext(edu.iu.dsc.tws.api.tset.TSetContext) Random(java.util.Random) HashingPartitioner(edu.iu.dsc.tws.tset.fn.HashingPartitioner)

Aggregations

TSetContext (edu.iu.dsc.tws.api.tset.TSetContext)9 BatchEnvironment (edu.iu.dsc.tws.tset.env.BatchEnvironment)6 Iterator (java.util.Iterator)6 Tuple (edu.iu.dsc.tws.api.comms.structs.Tuple)5 DataPartitionConsumer (edu.iu.dsc.tws.api.dataset.DataPartitionConsumer)4 HashMap (java.util.HashMap)3 Random (java.util.Random)3 BaseComputeFunc (edu.iu.dsc.tws.api.tset.fn.BaseComputeFunc)2 BatchChkPntEnvironment (edu.iu.dsc.tws.tset.env.BatchChkPntEnvironment)2 ArrayList (java.util.ArrayList)2 BaseComputeCollectorFunc (edu.iu.dsc.tws.api.tset.fn.BaseComputeCollectorFunc)1 BaseSourceFunc (edu.iu.dsc.tws.api.tset.fn.BaseSourceFunc)1 RecordCollector (edu.iu.dsc.tws.api.tset.fn.RecordCollector)1 SourceFunc (edu.iu.dsc.tws.api.tset.fn.SourceFunc)1 BatchRowTLink (edu.iu.dsc.tws.api.tset.link.batch.BatchRowTLink)1 KeyedSchema (edu.iu.dsc.tws.api.tset.schema.KeyedSchema)1 RowSchema (edu.iu.dsc.tws.api.tset.schema.RowSchema)1 Row (edu.iu.dsc.tws.common.table.Row)1 TField (edu.iu.dsc.tws.common.table.TField)1 TwoRow (edu.iu.dsc.tws.common.table.TwoRow)1