Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class PartitionedFileSetDataset, method onSuccess.
@Override
public void onSuccess() throws DataSetException {
  String outputPath = FileSetArguments.getOutputPath(runtimeArguments);
  // if there is no output path, the job either used DynamicPartitioner or produced no output;
  // either way, we can't do much here.
  if (outputPath == null) {
    return;
  }
  // it's possible that there is no output key, if using the DynamicPartitioner, in which case
  // DynamicPartitioningOutputFormat is responsible for registering the partitions and the metadata
  PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
  if (outputKey != null) {
    Map<String, String> metadata = PartitionedFileSetArguments.getOutputPartitionMetadata(runtimeArguments);
    addPartition(outputKey, outputPath, metadata, true);
  }
  // currently, FileSetDataset#onSuccess is a no-op, but call it in case it does something in the future
  ((FileSetDataset) files).onSuccess();
}
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class PartitionedFileSetDataset, method getPartitions.
private void getPartitions(@Nullable PartitionFilter filter, PartitionConsumer consumer, boolean decodeMetadata,
                           @Nullable byte[] startKey, @Nullable byte[] endKey, long limit) {
  long count = 0L;
  try (Scanner scanner = partitionsTable.scan(startKey, endKey)) {
    while (count < limit) {
      Row row = scanner.next();
      if (row == null) {
        break;
      }
      PartitionKey key;
      try {
        key = parseRowKey(row.getRow(), partitioning);
      } catch (IllegalArgumentException e) {
        if (!ignoreInvalidRowsSilently) {
          LOG.debug(String.format("Failed to parse row key for partitioned file set '%s': %s",
                                  getName(), Bytes.toStringBinary(row.getRow())));
        }
        continue;
      }
      if (filter != null && !filter.match(key)) {
        continue;
      }
      byte[] pathBytes = row.get(RELATIVE_PATH);
      if (pathBytes != null) {
        consumer.consume(key, Bytes.toString(pathBytes), decodeMetadata ? metadataFromRow(row) : null);
      }
      count++;
    }
    if (count == 0) {
      warnIfInvalidPartitionFilter(filter, partitioning);
    }
  }
}
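Callers do not invoke this private scan directly; they go through the public PartitionedFileSet API, which is backed by it. A minimal usage sketch, assuming a PartitionedFileSet instance named pfs whose partitioning has a string field "league" (the field and value are assumptions):

PartitionFilter filter = PartitionFilter.builder().addValueCondition("league", "nfl").build();
for (PartitionDetail detail : pfs.getPartitions(filter)) {
  System.out.println(detail.getPartitionKey() + " -> " + detail.getRelativePath());
}

Note that rows whose keys fail to parse are skipped with a continue and do not count toward the limit, so a scan over a table with invalid rows still returns up to limit valid partitions.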
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class ScoreCounter, method initialize.
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(ResultsMapper.class);
  job.setReducerClass(TeamCounter.class);
  job.setNumReduceTasks(1);
  String league = context.getRuntimeArguments().get("league");
  Preconditions.checkNotNull(league);
  // Configure the input to read all seasons for the league
  Map<String, String> inputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addValueCondition("league", league).build());
  context.addInput(Input.ofDataset("results", inputArgs));
  // Each run writes its output to a partition for the league
  Map<String, String> outputArgs = Maps.newHashMap();
  PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  context.addOutput(Output.ofDataset("totals", outputArgs));
  // used only for logging:
  PartitionedFileSet input = context.getDataset("results", inputArgs);
  PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
  String outputPath = FileSetArguments.getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
  LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
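Because of the Preconditions check, initialize() fails fast if the "league" runtime argument is missing. A hedged launch sketch using the CDAP test framework; the program name "ScoreCounter", the league value, and the timeout are assumptions for illustration:

Map<String, String> runtimeArgs = new HashMap<>();
runtimeArgs.put("league", "nfl"); // hypothetical league value
MapReduceManager mrManager = applicationManager.getMapReduceManager("ScoreCounter").start(runtimeArgs);
mrManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);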
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class PartitionKeyCodecTest, method testSerDe.
@Test
public void testSerDe() {
  PartitionKey key = PartitionKey.builder()
    .addField("a", "value,")
    .addField("b", 1L)
    .addField("c", -17)
    .addField("d", true)
    .addIntField("e", 42)
    .addLongField("f", 15)
    .addStringField("g", "value]}")
    .build();
  Gson gson = new GsonBuilder().registerTypeAdapter(PartitionKey.class, new PartitionKeyCodec()).create();
  String serialized = gson.toJson(key);
  Assert.assertEquals(key, gson.fromJson(serialized, PartitionKey.class));
}
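The same codec works anywhere a PartitionKey has to cross a JSON boundary. A minimal round-trip sketch under that assumption; the field names and values here are made up:

Gson gson = new GsonBuilder().registerTypeAdapter(PartitionKey.class, new PartitionKeyCodec()).create();
PartitionKey key = PartitionKey.builder().addStringField("league", "nfl").addIntField("season", 2017).build();
String json = gson.toJson(key);   // e.g. persist as a string property
PartitionKey restored = gson.fromJson(json, PartitionKey.class);
Assert.assertEquals(key, restored); // field values and their types survive the round trip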
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class SparkFileSetTestRun, method testSparkWithPartitionedFileSet.
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  final PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());
  // clean up after the test completes
  location.delete(true);
  partition.getLocation().delete(true);
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}