Example 26 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class PartitionedFileSetDataset, method onSuccess.

@Override
public void onSuccess() throws DataSetException {
    String outputPath = FileSetArguments.getOutputPath(runtimeArguments);
    // if no output path was set, there is no partition to register here
    if (outputPath == null) {
        return;
    }
    // it is possible that there is no output key if the DynamicPartitioner is used; in that case,
    // DynamicPartitioningOutputFormat is responsible for registering the partitions and their metadata
    PartitionKey outputKey = PartitionedFileSetArguments.getOutputPartitionKey(runtimeArguments, getPartitioning());
    if (outputKey != null) {
        Map<String, String> metadata = PartitionedFileSetArguments.getOutputPartitionMetadata(runtimeArguments);
        addPartition(outputKey, outputPath, metadata, true);
    }
    // currently, FileSetDataset#onSuccess is a no-op, but call it in case it does something in the future
    ((FileSetDataset) files).onSuccess();
}
Also used: FileSetDataset (co.cask.cdap.data2.dataset2.lib.file.FileSetDataset), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)
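
The partition registered in onSuccess() is driven entirely by runtime arguments that the caller sets before the run. A minimal sketch of that caller side (the field name "league", its value, and the metadata map are illustrative; setOutputPartitionMetadata is assumed to be the setter counterpart of the getter used above):

Map<String, String> runtimeArguments = new HashMap<>();
PartitionKey outputKey = PartitionKey.builder().addStringField("league", "nfl").build();
// the key read back by getOutputPartitionKey(...) in onSuccess()
PartitionedFileSetArguments.setOutputPartitionKey(runtimeArguments, outputKey);
// the metadata read back by getOutputPartitionMetadata(...)
PartitionedFileSetArguments.setOutputPartitionMetadata(runtimeArguments, Collections.singletonMap("source", "game-feed"));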

Example 27 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class PartitionedFileSetDataset, method getPartitions.

private void getPartitions(@Nullable PartitionFilter filter, PartitionConsumer consumer, boolean decodeMetadata, @Nullable byte[] startKey, @Nullable byte[] endKey, long limit) {
    long count = 0L;
    try (Scanner scanner = partitionsTable.scan(startKey, endKey)) {
        while (count < limit) {
            Row row = scanner.next();
            if (row == null) {
                break;
            }
            PartitionKey key;
            try {
                key = parseRowKey(row.getRow(), partitioning);
            } catch (IllegalArgumentException e) {
                if (!ignoreInvalidRowsSilently) {
                    LOG.debug(String.format("Failed to parse row key for partitioned file set '%s': %s", getName(), Bytes.toStringBinary(row.getRow())));
                }
                continue;
            }
            if (filter != null && !filter.match(key)) {
                continue;
            }
            byte[] pathBytes = row.get(RELATIVE_PATH);
            if (pathBytes != null) {
                consumer.consume(key, Bytes.toString(pathBytes), decodeMetadata ? metadataFromRow(row) : null);
            }
            count++;
        }
        if (count == 0) {
            warnIfInvalidPartitionFilter(filter, partitioning);
        }
    }
}
Also used: Scanner (co.cask.cdap.api.dataset.table.Scanner), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), Row (co.cask.cdap.api.dataset.table.Row)
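
This private helper backs the dataset's public partition queries. A minimal sketch of reaching the same scan through the public API (pfs stands in for any PartitionedFileSet instance; the filter values are illustrative):

PartitionFilter filter = PartitionFilter.builder().addValueCondition("league", "nfl").build();
for (PartitionDetail detail : pfs.getPartitions(filter)) {
    // each detail carries the parsed key and the relative path consumed above
    LOG.info("partition {} at {}", detail.getPartitionKey(), detail.getRelativePath());
}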

Example 28 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class ScoreCounter, method initialize.

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setMapperClass(ResultsMapper.class);
    job.setReducerClass(TeamCounter.class);
    job.setNumReduceTasks(1);
    String league = context.getRuntimeArguments().get("league");
    Preconditions.checkNotNull(league);
    // Configure the input to read all seasons for the league
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addValueCondition("league", league).build());
    context.addInput(Input.ofDataset("results", inputArgs));
    // Each run writes its output to a partition for the league
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    context.addOutput(Output.ofDataset("totals", outputArgs));
    // used only for logging:
    PartitionedFileSet input = context.getDataset("results", inputArgs);
    PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
    String outputPath = FileSetArguments.getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
    LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), Job (org.apache.hadoop.mapreduce.Job)
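
initialize() fails fast if the "league" runtime argument is missing (Preconditions.checkNotNull). A minimal sketch of starting the program with that argument from a test, assuming the program is registered under the name "ScoreCounter" (the program name and league value are illustrative):

MapReduceManager mrManager = applicationManager.getMapReduceManager("ScoreCounter");
mrManager.start(ImmutableMap.of("league", "nfl"));
mrManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);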

Example 29 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class PartitionKeyCodecTest, method testSerDe.

@Test
public void testSerDe() {
    PartitionKey key = PartitionKey.builder()
        .addField("a", "value,")
        .addField("b", 1L)
        .addField("c", -17)
        .addField("d", true)
        .addIntField("e", 42)
        .addLongField("f", 15)
        .addStringField("g", "value]}")
        .build();
    Gson gson = new GsonBuilder().registerTypeAdapter(PartitionKey.class, new PartitionKeyCodec()).create();
    String serialized = gson.toJson(key);
    Assert.assertEquals(key, gson.fromJson(serialized, PartitionKey.class));
}
Also used: GsonBuilder (com.google.gson.GsonBuilder), PartitionKeyCodec (co.cask.cdap.api.dataset.lib.partitioned.PartitionKeyCodec), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), Gson (com.google.gson.Gson), Test (org.junit.Test)
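
A dedicated codec is needed here presumably because PartitionKey holds field values of mixed Comparable types (String, int, long, boolean), which plain reflection-based Gson could not restore to their original types. The registered adapter also applies to nested occurrences, as in this sketch (TypeToken is standard Gson, com.google.gson.reflect.TypeToken, not CDAP-specific):

List<PartitionKey> keys = Collections.singletonList(key);
String json = gson.toJson(keys);
List<PartitionKey> restored = gson.fromJson(json, new TypeToken<List<PartitionKey>>() { }.getType());
Assert.assertEquals(keys, restored);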

Example 30 with PartitionKey

Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.

The class SparkFileSetTestRun, method testSparkWithPartitionedFileSet.

private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
    PartitionedFileSet pfs = pfsManager.get();
    PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    pfsManager.flush();
    Map<String, String> inputArgs = new HashMap<>();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
    Map<String, String> outputArgs = new HashMap<>();
    final PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
    args.put("input", "pfs");
    args.put("output", "pfs");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    pfsManager.flush();
    PartitionDetail partition = pfs.getPartition(outputKey);
    Assert.assertNotNull(partition);
    validateFileOutput(partition.getLocation());
    // clean up after the test completes
    location.delete(true);
    partition.getLocation().delete(true);
    pfs.dropPartition(partitionOutput.getPartitionKey());
    pfs.dropPartition(partition.getPartitionKey());
    pfsManager.flush();
}
Also used: SparkManager (co.cask.cdap.test.SparkManager), PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput), HashMap (java.util.HashMap), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail), Location (org.apache.twill.filesystem.Location)
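
Note that input and output are the same dataset ("pfs") with differently scoped arguments: the range condition on x (from "na" to "nx") lexicographically matches the input partition x="nn" written at the start of the test, while the output lands in the separate partition x="xx", which the test then validates and drops.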

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 59
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 28
Test (org.junit.Test): 27
TransactionAware (org.apache.tephra.TransactionAware): 17
TransactionExecutor (org.apache.tephra.TransactionExecutor): 17
IOException (java.io.IOException): 12
HashMap (java.util.HashMap): 12
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 11
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 11
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer): 11
ArrayList (java.util.ArrayList): 11
List (java.util.List): 11
HashSet (java.util.HashSet): 10
DataSetException (co.cask.cdap.api.dataset.DataSetException): 9
ImmutableList (com.google.common.collect.ImmutableList): 9
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 7
Partition (co.cask.cdap.api.dataset.lib.Partition): 7
ConsumerConfiguration (co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration): 7
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 6
Location (org.apache.twill.filesystem.Location): 6