
Example 1 with Split

Use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

The getSplits method of the class DatasetInputFormat:

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }
        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
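            // Wrap each CDAP Split in a Hadoop InputSplit so MapReduce can schedule it.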
            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) RecordScannable(io.cdap.cdap.api.data.batch.RecordScannable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) Split(io.cdap.cdap.api.data.batch.Split) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit)
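
For context, reading records back from such splits follows the scanner lifecycle of CDAP's RecordScannable API. A minimal sketch, assuming a RecordScannable<Record> named scannable (the scannable variable and the Record type parameter are placeholders, not taken from the snippet above):

// A sketch, not from the sources above: drain every Split of a RecordScannable.
void scanAll(RecordScannable<Record> scannable) throws Exception {
    for (Split split : scannable.getSplits()) {
        RecordScanner<Record> scanner = scannable.createSplitRecordScanner(split);
        try {
            // Same lifecycle as a SplitReader: initialize, advance, read, close.
            scanner.initialize(split);
            while (scanner.nextRecord()) {
                System.out.println(scanner.getCurrentRecord());
            }
        } finally {
            scanner.close();
        }
    }
}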

Example 2 with Split

Use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

The getSplits method of the class BufferingTable:

/**
 * Fallback implementation of getSplits, delegating to {@link SplitsUtil#primitiveGetSplits(int, byte[], byte[])}.
 * Ideally this should be overridden by subclasses.
 *
 * @param numSplits Desired number of splits. If greater than zero, at most this many splits will be returned.
 *                  If less than or equal to zero, any number of splits can be returned.
 * @param start If non-null, the returned splits will only cover keys greater than or equal to it.
 * @param stop If non-null, the returned splits will only cover keys less than it.
 * @return list of {@link Split}
 */
@Override
public List<Split> getSplits(int numSplits, byte[] start, byte[] stop) {
    ensureTransactionIsStarted();
    List<KeyRange> keyRanges = SplitsUtil.primitiveGetSplits(numSplits, start, stop);
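    // Lists.transform returns a lazy view: each KeyRange is converted to a TableSplit on access.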
    return Lists.transform(keyRanges, new Function<KeyRange, Split>() {

        @Nullable
        @Override
        public Split apply(@Nullable KeyRange input) {
            return new TableSplit(input == null ? null : input.getStart(), input == null ? null : input.getStop());
        }
    });
}
Also used: TableSplit(io.cdap.cdap.api.dataset.table.TableSplit) Split(io.cdap.cdap.api.data.batch.Split) Nullable(javax.annotation.Nullable)
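
Given that contract, a caller can bound both the split count and the key range. A hypothetical usage sketch (the table variable and the key values are illustrative, not from the snippet above):

// At most 4 splits, covering only keys in [startKey, stopKey).
byte[] startKey = Bytes.toBytes("a");
byte[] stopKey = Bytes.toBytes("m");
List<Split> splits = table.getSplits(4, startKey, stopKey);

// numSplits <= 0 leaves the number of splits up to the implementation.
List<Split> allSplits = table.getSplits(-1, null, null);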

Example 3 with Split

Use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

The createBatchReadableConfiguration method of the class DatasetInputFormatProvider:

private Map<String, String> createBatchReadableConfiguration() {
    List<Split> splits = this.splits;
    if (splits == null) {
        splits = ((BatchReadable<?, ?>) dataset).getSplits();
    }
    Configuration hConf = new Configuration();
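    // Start from an empty Configuration so the map below contains only the split entries.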
    hConf.clear();
    try {
        AbstractBatchReadableInputFormat.setDatasetSplits(hConf, datasetNamespace, datasetName, datasetArgs, splits);
        return ConfigurationUtil.toMap(hConf);
    } catch (IOException e) {
        throw new IllegalArgumentException(e);
    }
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) Split(io.cdap.cdap.api.data.batch.Split)
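
ConfigurationUtil.toMap is CDAP-internal, but the pattern it relies on is plain Hadoop: Configuration implements Iterable over its key/value entries, so after hConf.clear() the map contains only what setDatasetSplits wrote. A rough equivalent, using java.util.HashMap (an assumption about the idea, not the actual CDAP implementation):

// Sketch of converting a Hadoop Configuration into a plain Map.
static Map<String, String> toMap(Configuration conf) {
    // Configuration is Iterable<Map.Entry<String, String>> over its entries.
    Map<String, String> result = new HashMap<>();
    for (Map.Entry<String, String> entry : conf) {
        result.put(entry.getKey(), entry.getValue());
    }
    return result;
}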

Example 4 with Split

Use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

The testBatchCustomList method of the class ObjectStoreDatasetTest:

@Test
public void testBatchCustomList() throws Exception {
    DatasetId customlist = DatasetFrameworkTestUtil.NAMESPACE_ID.dataset("customlist");
    createObjectStoreInstance(customlist, new TypeToken<List<Custom>>() {
    }.getType());
    final ObjectStoreDataset<List<Custom>> customStore = dsFrameworkUtil.getInstance(customlist);
    TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor(customStore);
    final SortedSet<Long> keysWritten = Sets.newTreeSet();
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            List<Custom> customList1 = Arrays.asList(new Custom(1, Lists.newArrayList("one", "ONE")), new Custom(2, Lists.newArrayList("two", "TWO")));
            Random rand = new Random(100);
            long key1 = rand.nextLong();
            keysWritten.add(key1);
            customStore.write(Bytes.toBytes(key1), customList1);
            List<Custom> customList2 = Arrays.asList(new Custom(3, Lists.newArrayList("three", "THREE")), new Custom(4, Lists.newArrayList("four", "FOUR")));
            long key2 = rand.nextLong();
            keysWritten.add(key2);
            customStore.write(Bytes.toBytes(key2), customList2);
        }
    });
    final SortedSet<Long> keysWrittenCopy = ImmutableSortedSet.copyOf(keysWritten);
    txnl.execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // get the splits for the table
            List<Split> splits = customStore.getSplits();
            for (Split split : splits) {
                SplitReader<byte[], List<Custom>> reader = customStore.createSplitReader(split);
                try {
                    reader.initialize(split);
                    while (reader.nextKeyValue()) {
                        byte[] key = reader.getCurrentKey();
                        Assert.assertTrue(keysWritten.remove(Bytes.toLong(key)));
                    }
                } finally {
                    // Close the reader so the split's resources are released.
                    reader.close();
                }
            }
            // verify all keys have been read
            if (!keysWritten.isEmpty()) {
                System.out.println("Remaining [" + keysWritten.size() + "]: " + keysWritten);
            }
            Assert.assertTrue(keysWritten.isEmpty());
        }
    });
    deleteAndVerifyInBatch(customStore, txnl, keysWrittenCopy);
    dsFrameworkUtil.deleteInstance(customlist);
}
Also used: SplitReader(io.cdap.cdap.api.data.batch.SplitReader) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionFailureException(org.apache.tephra.TransactionFailureException) NoSuchElementException(java.util.NoSuchElementException) DatasetId(io.cdap.cdap.proto.id.DatasetId) Random(java.util.Random) TypeToken(com.google.common.reflect.TypeToken) List(java.util.List) Split(io.cdap.cdap.api.data.batch.Split) Test(org.junit.Test)
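
The read loop in this test is the general BatchReadable pattern: get the splits, create a SplitReader per split, then drive it with initialize/nextKeyValue/getCurrentKey. A generic helper in that shape (illustrative only, not part of the CDAP codebase) might look like:

// Sketch: collect every key of a BatchReadable across all of its splits.
static <K, V> List<K> readAllKeys(BatchReadable<K, V> readable) throws Exception {
    List<K> keys = new ArrayList<>();
    for (Split split : readable.getSplits()) {
        SplitReader<K, V> reader = readable.createSplitReader(split);
        try {
            reader.initialize(split);
            while (reader.nextKeyValue()) {
                keys.add(reader.getCurrentKey());
            }
        } finally {
            reader.close();
        }
    }
    return keys;
}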

Example 5 with Split

Use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

The testGetSplits method of the class ObjectMappedTableDatasetTest:

@Test
public void testGetSplits() throws Exception {
    dsFrameworkUtil.createInstance(ObjectMappedTable.class.getName(), RECORDS_ID, ObjectMappedTableProperties.builder().setType(Record.class).build());
    try {
        final ObjectMappedTableDataset<Record> records = dsFrameworkUtil.getInstance(RECORDS_ID);
        TransactionExecutor txnl = dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) records);
        final Record record = new Record(Integer.MAX_VALUE, Long.MAX_VALUE, Float.MAX_VALUE, Double.MAX_VALUE, "foobar", Bytes.toBytes("foobar"), ByteBuffer.wrap(Bytes.toBytes("foobar")), UUID.randomUUID());
        final byte[] rowkey = Bytes.toBytes("row1");
        txnl.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                records.write(rowkey, record);
            }
        });
        // should not include the record, since upper bound is not inclusive
        txnl.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                List<Split> splits = records.getSplits(1, null, rowkey);
                List<Record> recordsRead = new ArrayList<>();
                for (Split split : splits) {
                    SplitReader<byte[], Record> splitReader = records.createSplitReader(split);
                    try {
                        splitReader.initialize(split);
                        while (splitReader.nextKeyValue()) {
                            recordsRead.add(splitReader.getCurrentValue());
                        }
                    } finally {
                        splitReader.close();
                    }
                }
                Assert.assertEquals(0, recordsRead.size());
            }
        });
        // should include the record, since lower bound is inclusive
        txnl.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                List<Split> splits = records.getSplits(1, rowkey, null);
                List<Record> recordsRead = new ArrayList<>();
                for (Split split : splits) {
                    SplitReader<byte[], Record> splitReader = records.createSplitReader(split);
                    try {
                        splitReader.initialize(split);
                        while (splitReader.nextKeyValue()) {
                            recordsRead.add(splitReader.getCurrentValue());
                        }
                    } finally {
                        splitReader.close();
                    }
                }
                Assert.assertEquals(1, recordsRead.size());
                Assert.assertEquals(record, recordsRead.get(0));
            }
        });
    } finally {
        dsFrameworkUtil.deleteInstance(RECORDS_ID);
    }
}
Also used: SplitReader(io.cdap.cdap.api.data.batch.SplitReader) TransactionExecutor(org.apache.tephra.TransactionExecutor) ArrayList(java.util.ArrayList) List(java.util.List) ObjectMappedTable(io.cdap.cdap.api.dataset.lib.ObjectMappedTable) Split(io.cdap.cdap.api.data.batch.Split) Test(org.junit.Test)
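
The two assertions pin down the range semantics: getSplits(numSplits, start, stop) covers keys in [start, stop), where null means unbounded on that side. Restated as a sketch against the same records dataset and rowkey from the test:

List<Split> excludesRow = records.getSplits(1, null, rowkey); // stop is exclusive: "row1" not covered
List<Split> includesRow = records.getSplits(1, rowkey, null); // start is inclusive: "row1" covered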

Aggregations

Split (io.cdap.cdap.api.data.batch.Split): 10
IOException (java.io.IOException): 3
SplitReader (io.cdap.cdap.api.data.batch.SplitReader): 2
List (java.util.List): 2
TransactionExecutor (org.apache.tephra.TransactionExecutor): 2
Test (org.junit.Test): 2
ByteArrayDataOutput (com.google.common.io.ByteArrayDataOutput): 1
TypeToken (com.google.common.reflect.TypeToken): 1
JsonObject (com.google.gson.JsonObject): 1
JsonParseException (com.google.gson.JsonParseException): 1
DatasetInstantiationException (io.cdap.cdap.api.data.DatasetInstantiationException): 1
BatchReadable (io.cdap.cdap.api.data.batch.BatchReadable): 1
RecordScannable (io.cdap.cdap.api.data.batch.RecordScannable): 1
Dataset (io.cdap.cdap.api.dataset.Dataset): 1
ObjectMappedTable (io.cdap.cdap.api.dataset.lib.ObjectMappedTable): 1
TableSplit (io.cdap.cdap.api.dataset.table.TableSplit): 1
TopicNotFoundException (io.cdap.cdap.api.messaging.TopicNotFoundException): 1
ForwardingSplitReader (io.cdap.cdap.internal.app.runtime.batch.dataset.ForwardingSplitReader): 1
DatasetId (io.cdap.cdap.proto.id.DatasetId): 1
UnauthorizedException (io.cdap.cdap.security.spi.authorization.UnauthorizedException): 1