Search in sources :

Example 6 with Split

use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

the class DatasetInfoTypeAdapter method deserialize.

/**
 * Deserializes a {@link DatasetInfo} from its JSON object form.
 *
 * <p>Reads the {@code datasetName} and {@code datasetArgs} members. When the {@code datasetSplitClass}
 * member is absent or is an explicit JSON {@code null}, the result carries {@code null} splits. Otherwise
 * the named class is loaded and used as the element type for decoding the {@code datasetSplits} member.
 *
 * @param json the JSON element to decode; it is read as a JSON object
 * @param typeOfT the requested target type (not consulted by this method)
 * @param context Gson context used to decode the nested arguments map and the list of splits
 * @return the decoded dataset info
 * @throws JsonParseException if the recorded split class cannot be loaded
 */
@Override
public DatasetInfo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException {
    JsonObject obj = json.getAsJsonObject();
    String datasetName = obj.get("datasetName").getAsString();
    Map<String, String> datasetArgs = context.deserialize(obj.get("datasetArgs"), mapType);
    // Look the member up once. JsonObject.get returns Java null only when the member is missing; an
    // explicit JSON null comes back as a JsonNull element, on which getAsString() would fail.
    JsonElement splitClassElement = obj.get("datasetSplitClass");
    if (splitClassElement == null || splitClassElement.isJsonNull()) {
        return new DatasetInfo(datasetName, datasetArgs, null);
    }
    String datasetSplitClass = splitClassElement.getAsString();
    // Prefer the thread context class loader; fall back to the loader of SparkBatchSourceFactory.
    ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(), SparkBatchSourceFactory.class.getClassLoader());
    try {
        Class<?> splitClass = classLoader.loadClass(datasetSplitClass);
        List<Split> splits = context.deserialize(obj.get("datasetSplits"), getListType(splitClass));
        return new DatasetInfo(datasetName, datasetArgs, splits);
    } catch (ClassNotFoundException e) {
        throw new JsonParseException("Unable to deserialize splits", e);
    }
}
Also used : JsonObject(com.google.gson.JsonObject) Split(io.cdap.cdap.api.data.batch.Split) JsonParseException(com.google.gson.JsonParseException)

Example 7 with Split

use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

the class ObjectStoreDatasetTest method verifySplits.

/**
 * Helper to verify that the split readers for the given splits return exactly a set of keys.
 *
 * <p>Every key read is removed from {@code keysToVerify}; the set must be empty once all splits
 * have been consumed. Each reader is closed after use, even when an assertion fails.
 *
 * @param t the dataset that creates a reader for each split
 * @param splits the splits to read
 * @param keysToVerify the expected keys; emptied as keys are read
 * @throws InterruptedException if a reader call is interrupted
 */
private void verifySplits(ObjectStoreDataset<String> t, List<Split> splits, SortedSet<Long> keysToVerify) throws InterruptedException {
    // read each split and verify the keys, remove all read keys from the set
    for (Split split : splits) {
        SplitReader<byte[], String> reader = t.createSplitReader(split);
        try {
            reader.initialize(split);
            while (reader.nextKeyValue()) {
                byte[] rawKey = reader.getCurrentKey();
                String value = reader.getCurrentValue();
                long key = Bytes.toLong(rawKey);
                // verify the value is the decimal string form of the key
                Assert.assertEquals(Long.toString(key), value);
                // remove() returns false for an unexpected or duplicate key
                Assert.assertTrue(keysToVerify.remove(key));
            }
        } finally {
            // release the reader's resources regardless of assertion outcome
            reader.close();
        }
    }
    // verify all keys have been read
    if (!keysToVerify.isEmpty()) {
        System.out.println("Remaining [" + keysToVerify.size() + "]: " + keysToVerify);
    }
    Assert.assertTrue(keysToVerify.isEmpty());
}
Also used : Split(io.cdap.cdap.api.data.batch.Split)

Example 8 with Split

use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

the class BasicMapReduceTaskContext method getBatchReadable.

/**
 * Looks up the named dataset for read access and exposes it as a {@link BatchReadable}.
 *
 * <p>The returned object forwards to the dataset and additionally calls {@code flushOperations()}
 * after the splits have been computed and after a split reader has been closed. The flush runs in a
 * {@code finally} block, so it happens even when the forwarded call throws. Any exception raised by
 * the forwarded call or by the flush is rethrown through {@link Throwables#propagate(Throwable)}.
 *
 * @param datasetNamespace namespace of the dataset, or {@code null} to use the lookup without a namespace
 * @param datasetName name of the dataset
 * @param datasetArgs arguments for the dataset
 * @throws IllegalArgumentException if the dataset is not a {@link BatchReadable}
 */
<K, V> BatchReadable<K, V> getBatchReadable(@Nullable String datasetNamespace, String datasetName, Map<String, String> datasetArgs) {
    Dataset dataset;
    if (datasetNamespace != null) {
        dataset = getDataset(datasetNamespace, datasetName, datasetArgs, AccessType.READ);
    } else {
        dataset = getDataset(datasetName, datasetArgs, AccessType.READ);
    }
    // Only datasets that can be read in batch are acceptable here.
    Preconditions.checkArgument(dataset instanceof BatchReadable, "Dataset '%s' is not a BatchReadable.", datasetName);
    @SuppressWarnings("unchecked") final BatchReadable<K, V> readable = (BatchReadable<K, V>) dataset;
    return new BatchReadable<K, V>() {

        @Override
        public List<Split> getSplits() {
            try {
                // The inner try/finally keeps the flush inside the outer catch, so a failing flush
                // is propagated the same way as a failing getSplits().
                try {
                    return readable.getSplits();
                } finally {
                    flushOperations();
                }
            } catch (Exception ex) {
                throw Throwables.propagate(ex);
            }
        }

        @Override
        public SplitReader<K, V> createSplitReader(Split split) {
            SplitReader<K, V> underlying = readable.createSplitReader(split);
            return new ForwardingSplitReader<K, V>(underlying) {

                @Override
                public void close() {
                    try {
                        // Close first, then flush; the flush runs even if close() throws.
                        try {
                            super.close();
                        } finally {
                            flushOperations();
                        }
                    } catch (Exception ex) {
                        throw Throwables.propagate(ex);
                    }
                }
            };
        }
    };
}
Also used : ForwardingSplitReader(io.cdap.cdap.internal.app.runtime.batch.dataset.ForwardingSplitReader) Dataset(io.cdap.cdap.api.dataset.Dataset) BatchReadable(io.cdap.cdap.api.data.batch.BatchReadable) Split(io.cdap.cdap.api.data.batch.Split) FileNotFoundException(java.io.FileNotFoundException) DatasetInstantiationException(io.cdap.cdap.api.data.DatasetInstantiationException) UnauthorizedException(io.cdap.cdap.security.spi.authorization.UnauthorizedException) IOException(java.io.IOException) TopicNotFoundException(io.cdap.cdap.api.messaging.TopicNotFoundException)

Example 9 with Split

use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

the class AbstractBatchReadableInputFormat method setDatasetSplits.

/**
 * Stores the dataset identity, its arguments and its splits in the given {@link Configuration}.
 *
 * <p>The namespace is written only when it is non-null. The arguments are stored as JSON. The splits
 * are stored as a binary string holding the number of splits followed by each split wrapped in a
 * {@code DataSetInputSplit}.
 *
 * @param hConf            configuration to modify
 * @param datasetNamespace namespace of the dataset, or {@code null} to leave the namespace entry untouched
 * @param datasetName      name of the dataset
 * @param datasetArguments arguments for the dataset
 * @param splits           list of splits on the dataset
 * @throws IOException if writing a split to the output fails
 */
public static void setDatasetSplits(Configuration hConf, @Nullable String datasetNamespace, String datasetName, Map<String, String> datasetArguments, List<Split> splits) throws IOException {
    if (datasetNamespace != null) {
        hConf.set(DATASET_NAMESPACE, datasetNamespace);
    }
    hConf.set(DATASET_NAME, datasetName);
    String argumentsJson = GSON.toJson(datasetArguments, DATASET_ARGS_TYPE);
    hConf.set(DATASET_ARGS, argumentsJson);

    // Layout: an int count, then that many serialized DataSetInputSplit objects.
    ByteArrayDataOutput encoded = ByteStreams.newDataOutput();
    encoded.writeInt(splits.size());
    for (Split each : splits) {
        DataSetInputSplit wrapped = new DataSetInputSplit(each);
        wrapped.write(encoded);
    }
    byte[] rawSplits = encoded.toByteArray();
    hConf.set(SPLITS, Bytes.toStringBinary(rawSplits));
}
Also used : ByteArrayDataOutput(com.google.common.io.ByteArrayDataOutput) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Split(io.cdap.cdap.api.data.batch.Split)

Example 10 with Split

use of io.cdap.cdap.api.data.batch.Split in project cdap by caskdata.

the class KeyValueTableTest method verifySplits.

/**
 * Helper to verify that the split readers for the given splits return exactly a set of keys.
 *
 * <p>Every key read is removed from {@code keysToVerify}; the set must be empty once all splits
 * have been consumed. Each reader is closed after use, even when an assertion fails.
 *
 * @param t the table that creates a reader for each split
 * @param splits the splits to read
 * @param keysToVerify the expected keys; emptied as keys are read
 * @throws InterruptedException if a reader call is interrupted
 */
private void verifySplits(KeyValueTable t, List<Split> splits, SortedSet<Long> keysToVerify) throws InterruptedException {
    // read each split and verify the keys, remove all read keys from the set
    for (Split split : splits) {
        SplitReader<byte[], byte[]> reader = t.createSplitReader(split);
        try {
            reader.initialize(split);
            while (reader.nextKeyValue()) {
                byte[] key = reader.getCurrentKey();
                byte[] value = reader.getCurrentValue();
                // verify the value bytes are identical to the key bytes
                Assert.assertArrayEquals(key, value);
                // remove() returns false for an unexpected or duplicate key
                Assert.assertTrue(keysToVerify.remove(Bytes.toLong(key)));
            }
        } finally {
            // release the reader's resources regardless of assertion outcome
            reader.close();
        }
    }
    // verify all keys have been read
    if (!keysToVerify.isEmpty()) {
        System.out.println("Remaining [" + keysToVerify.size() + "]: " + keysToVerify);
    }
    Assert.assertTrue(keysToVerify.isEmpty());
}
Also used : Split(io.cdap.cdap.api.data.batch.Split)

Aggregations

Split (io.cdap.cdap.api.data.batch.Split)10 IOException (java.io.IOException)3 SplitReader (io.cdap.cdap.api.data.batch.SplitReader)2 List (java.util.List)2 TransactionExecutor (org.apache.tephra.TransactionExecutor)2 Test (org.junit.Test)2 ByteArrayDataOutput (com.google.common.io.ByteArrayDataOutput)1 TypeToken (com.google.common.reflect.TypeToken)1 JsonObject (com.google.gson.JsonObject)1 JsonParseException (com.google.gson.JsonParseException)1 DatasetInstantiationException (io.cdap.cdap.api.data.DatasetInstantiationException)1 BatchReadable (io.cdap.cdap.api.data.batch.BatchReadable)1 RecordScannable (io.cdap.cdap.api.data.batch.RecordScannable)1 Dataset (io.cdap.cdap.api.dataset.Dataset)1 ObjectMappedTable (io.cdap.cdap.api.dataset.lib.ObjectMappedTable)1 TableSplit (io.cdap.cdap.api.dataset.table.TableSplit)1 TopicNotFoundException (io.cdap.cdap.api.messaging.TopicNotFoundException)1 ForwardingSplitReader (io.cdap.cdap.internal.app.runtime.batch.dataset.ForwardingSplitReader)1 DatasetId (io.cdap.cdap.proto.id.DatasetId)1 UnauthorizedException (io.cdap.cdap.security.spi.authorization.UnauthorizedException)1