Example usage of io.cdap.cdap.api.data.batch.Split in project cdap (by caskdata):
the deserialize method of class DatasetInfoTypeAdapter.
@Override
public DatasetInfo deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context) throws JsonParseException {
  JsonObject obj = json.getAsJsonObject();
  String datasetName = obj.get("datasetName").getAsString();
  Map<String, String> datasetArgs = context.deserialize(obj.get("datasetArgs"), mapType);

  // A split class may be absent in two ways: the key is missing entirely (Gson returns Java null)
  // or it is present with an explicit JSON null (Gson returns JsonNull). The original code only
  // handled the first case; calling getAsString() on JsonNull throws UnsupportedOperationException.
  JsonElement splitClassElement = obj.get("datasetSplitClass");
  if (splitClassElement == null || splitClassElement.isJsonNull()) {
    return new DatasetInfo(datasetName, datasetArgs, null);
  }

  String datasetSplitClass = splitClassElement.getAsString();
  // Prefer the context classloader (set up per-program), falling back to the classloader that
  // loaded this artifact when no context classloader is installed on the current thread.
  ClassLoader classLoader = Objects.firstNonNull(Thread.currentThread().getContextClassLoader(), SparkBatchSourceFactory.class.getClassLoader());
  try {
    Class<?> splitClass = classLoader.loadClass(datasetSplitClass);
    List<Split> splits = context.deserialize(obj.get("datasetSplits"), getListType(splitClass));
    return new DatasetInfo(datasetName, datasetArgs, splits);
  } catch (ClassNotFoundException e) {
    // Wrap in JsonParseException (preserving the cause) per the Gson deserializer contract.
    throw new JsonParseException("Unable to deserialize splits", e);
  }
}
Example usage of io.cdap.cdap.api.data.batch.Split in project cdap (by caskdata):
the verifySplits helper of class ObjectStoreDatasetTest.
/**
 * Helper to verify that the split readers for the given splits return exactly the given set of keys.
 * Every key read is removed from {@code keysToVerify}; the set must be empty when all splits have
 * been consumed, proving the splits cover the expected keys exactly once with no extras.
 */
private void verifySplits(ObjectStoreDataset<String> t, List<Split> splits, SortedSet<Long> keysToVerify) throws InterruptedException {
  // read each split and verify the keys, remove all read keys from the set
  for (Split split : splits) {
    SplitReader<byte[], String> reader = t.createSplitReader(split);
    try {
      reader.initialize(split);
      while (reader.nextKeyValue()) {
        byte[] key = reader.getCurrentKey();
        String value = reader.getCurrentValue();
        // the value must be the decimal string form of the long encoded in the key
        Assert.assertEquals(Long.toString(Bytes.toLong(key)), value);
        // remove() returning true proves the key was expected and not seen before
        Assert.assertTrue(keysToVerify.remove(Bytes.toLong(key)));
      }
    } finally {
      // fix: the reader was never closed, leaking its underlying resources on every split
      reader.close();
    }
  }
  // verify all keys have been read
  if (!keysToVerify.isEmpty()) {
    System.out.println("Remaining [" + keysToVerify.size() + "]: " + keysToVerify);
  }
  Assert.assertTrue(keysToVerify.isEmpty());
}
Example usage of io.cdap.cdap.api.data.batch.Split in project cdap (by caskdata):
the getBatchReadable method of class BasicMapReduceTaskContext.
/**
 * Returns a {@link BatchReadable} that reads data from the given dataset.
 *
 * @param datasetNamespace namespace of the dataset; when {@code null}, the dataset is resolved
 *                         without an explicit namespace (the single-arg getDataset overload)
 * @param datasetName name of the dataset to read from
 * @param datasetArgs runtime arguments passed to the dataset
 * @param <K> key type produced by the readable
 * @param <V> value type produced by the readable
 * @return a wrapper around the dataset that flushes buffered operations after splits are
 *         computed and after each split reader is closed
 * @throws IllegalArgumentException if the resolved dataset is not a {@link BatchReadable}
 */
<K, V> BatchReadable<K, V> getBatchReadable(@Nullable String datasetNamespace, String datasetName, Map<String, String> datasetArgs) {
Dataset dataset;
if (datasetNamespace == null) {
dataset = getDataset(datasetName, datasetArgs, AccessType.READ);
} else {
dataset = getDataset(datasetNamespace, datasetName, datasetArgs, AccessType.READ);
}
// Must be BatchReadable.
Preconditions.checkArgument(dataset instanceof BatchReadable, "Dataset '%s' is not a BatchReadable.", datasetName);
@SuppressWarnings("unchecked") final BatchReadable<K, V> delegate = (BatchReadable<K, V>) dataset;
// Wrap the dataset so that flushOperations() runs after getSplits() and after each reader is
// closed — even when the delegate call throws (inner try/finally). Any exception, including one
// raised by flushOperations() itself, is propagated unchecked via the outer catch.
return new BatchReadable<K, V>() {
@Override
public List<Split> getSplits() {
try {
try {
return delegate.getSplits();
} finally {
// flush even if getSplits() failed, so buffered operations are not left pending
flushOperations();
}
} catch (Exception e) {
throw Throwables.propagate(e);
}
}
@Override
public SplitReader<K, V> createSplitReader(Split split) {
return new ForwardingSplitReader<K, V>(delegate.createSplitReader(split)) {
@Override
public void close() {
try {
try {
super.close();
} finally {
// flush after the reader closes, even if close() itself failed
flushOperations();
}
} catch (Exception e) {
throw Throwables.propagate(e);
}
}
};
}
};
}
Example usage of io.cdap.cdap.api.data.batch.Split in project cdap (by caskdata):
the setDatasetSplits method of class AbstractBatchReadableInputFormat.
/**
 * Stores the dataset identity, its runtime arguments, and the serialized list of splits in the
 * given Hadoop {@link Configuration} so that the input format can reconstruct them on the task side.
 *
 * @param hConf configuration to populate
 * @param datasetNamespace namespace of the dataset; omitted from the configuration when {@code null}
 * @param datasetName name of the dataset
 * @param datasetArguments runtime arguments for the dataset, stored as JSON
 * @param splits splits to encode; written as a count followed by that many serialized
 *               {@code DataSetInputSplit} entries
 * @throws IOException if writing a split to the in-memory buffer fails
 */
public static void setDatasetSplits(Configuration hConf, @Nullable String datasetNamespace, String datasetName, Map<String, String> datasetArguments, List<Split> splits) throws IOException {
  if (datasetNamespace != null) {
    hConf.set(DATASET_NAMESPACE, datasetNamespace);
  }
  hConf.set(DATASET_NAME, datasetName);
  hConf.set(DATASET_ARGS, GSON.toJson(datasetArguments, DATASET_ARGS_TYPE));

  // Serialize the splits as: count, then each split via DataSetInputSplit's Writable form.
  ByteArrayDataOutput buffer = ByteStreams.newDataOutput();
  buffer.writeInt(splits.size());
  for (Split each : splits) {
    new DataSetInputSplit(each).write(buffer);
  }
  String encodedSplits = Bytes.toStringBinary(buffer.toByteArray());
  hConf.set(SPLITS, encodedSplits);
}
Example usage of io.cdap.cdap.api.data.batch.Split in project cdap (by caskdata):
the verifySplits helper of class KeyValueTableTest.
/**
 * Helper to verify that the split readers for the given splits return exactly the given set of keys.
 * Every key read is removed from {@code keysToVerify}; the set must be empty when all splits have
 * been consumed, proving the splits cover the expected keys exactly once with no extras.
 */
private void verifySplits(KeyValueTable t, List<Split> splits, SortedSet<Long> keysToVerify) throws InterruptedException {
  // read each split and verify the keys, remove all read keys from the set
  for (Split split : splits) {
    SplitReader<byte[], byte[]> reader = t.createSplitReader(split);
    try {
      reader.initialize(split);
      while (reader.nextKeyValue()) {
        byte[] key = reader.getCurrentKey();
        byte[] value = reader.getCurrentValue();
        // the test data was written with value == key, so the bytes must match
        Assert.assertArrayEquals(key, value);
        // remove() returning true proves the key was expected and not seen before
        Assert.assertTrue(keysToVerify.remove(Bytes.toLong(key)));
      }
    } finally {
      // fix: the reader was never closed, leaking its underlying resources on every split
      reader.close();
    }
  }
  // verify all keys have been read
  if (!keysToVerify.isEmpty()) {
    System.out.println("Remaining [" + keysToVerify.size() + "]: " + keysToVerify);
  }
  Assert.assertTrue(keysToVerify.isEmpty());
}
Aggregations