use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class ClicksAndViewsMapReduceTest method getDataFromFile.
private Set<String> getDataFromFile() throws Exception {
  DataSetManager<PartitionedFileSet> cleanRecords = getDataset(ClicksAndViews.JOINED);
  Set<String> cleanData = new HashSet<>();
  // we configured the MapReduce to write to this partition when starting it
  PartitionDetail partition =
    cleanRecords.get().getPartition(PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
  Assert.assertNotNull(partition);
  for (Location location : partition.getLocation().list()) {
    if (location.getName().startsWith("part-")) {
      try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(location.getInputStream()))) {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
          cleanData.add(line);
        }
      }
    }
  }
  return cleanData;
}
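For context, here is a minimal sketch of how a caller could direct the MapReduce to write to the partition that getDataFromFile() later reads back, using PartitionedFileSetArguments.setOutputPartitionKey. It is not the test's actual start-up code; the way the arguments are scoped and passed to the program is an assumption.

// Sketch only: configure the output partition that getDataFromFile() expects.
// Everything except the CDAP API calls (PartitionKey.builder, setOutputPartitionKey) is assumed.
Map<String, String> outputArgs = new HashMap<>();
PartitionKey outputKey = PartitionKey.builder()
  .addLongField("runtime", OUTPUT_PARTITION_RUNTIME)
  .build();
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
// outputArgs would then be passed as dataset-scoped runtime arguments when starting the MapReduce.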
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class ConcurrentPartitionConsumer method selectPartitions.
private List<PartitionDetail> selectPartitions(PartitionAcceptor acceptor, ConsumerWorkingSet workingSet) {
  long now = System.currentTimeMillis();
  List<PartitionDetail> toConsume = new ArrayList<>();
  Iterator<ConsumablePartition> iter = workingSet.getPartitions().iterator();
  while (iter.hasNext()) {
    ConsumablePartition consumablePartition = iter.next();
    if (ProcessState.AVAILABLE != consumablePartition.getProcessState()) {
      continue;
    }
    PartitionDetail partition = getPartitionedFileSet().getPartition(consumablePartition.getPartitionKey());
    if (partition == null) {
      // no longer exists, so skip it and remove it from the working set
      iter.remove();
      continue;
    }
    PartitionAcceptor.Return accept = acceptor.accept(partition);
    switch (accept) {
      case ACCEPT:
        consumablePartition.take();
        consumablePartition.setTimestamp(now);
        toConsume.add(partition);
        continue;
      case SKIP:
        continue;
      case STOP:
        return toConsume;
    }
  }
  return toConsume;
}
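The PartitionAcceptor passed to selectPartitions decides, per PartitionDetail, whether to take the partition, skip it, or stop the scan. A minimal sketch of a custom acceptor that takes at most a fixed number of partitions, assuming only the ACCEPT/SKIP/STOP contract visible in the snippet above (the class name and limit logic are illustrative, not from the CDAP sources):

// Sketch: accept up to `limit` partitions, then tell selectPartitions(...) to stop.
public class LimitAcceptor implements PartitionAcceptor {
  private final int limit;
  private int accepted;

  public LimitAcceptor(int limit) {
    this.limit = limit;
  }

  @Override
  public Return accept(PartitionDetail partitionDetail) {
    if (accepted < limit) {
      accepted++;
      return Return.ACCEPT;
    }
    return Return.STOP;
  }
}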
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class ConsumerWorkingSet method populate.
/**
 * Populates the ConsumerWorkingSet by fetching partitions from the given PartitionedFileSet.
 *
 * @param partitionedFileSet the PartitionedFileSet to fetch partitions from
 * @param configuration the ConsumerConfiguration which defines parameters for consuming
 */
public void populate(PartitionedFileSet partitionedFileSet, ConsumerConfiguration configuration) {
  int numToPopulate = configuration.getMaxWorkingSetSize() - partitions.size();
  Predicate<PartitionDetail> predicate = configuration.getPartitionPredicate();
  co.cask.cdap.api.dataset.lib.PartitionConsumerResult result =
    partitionedFileSet.consumePartitions(partitionConsumerState, numToPopulate, predicate);
  List<PartitionDetail> partitions = result.getPartitions();
  for (PartitionDetail partition : partitions) {
    addPartition(partition.getPartitionKey());
  }
  partitionConsumerState = result.getPartitionConsumerState();
}
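The ConsumerConfiguration that drives populate() supplies both the working-set cap and the partition predicate. A sketch of how such a configuration might be built, assuming a builder with setMaxWorkingSetSize and setPartitionPredicate methods and the co.cask.cdap.api.Predicate callback; the field name "runtime" and the one-day cutoff are illustrative:

// Sketch: cap the working set and only consume partitions created in the last day.
ConsumerConfiguration configuration = ConsumerConfiguration.builder()
  .setMaxWorkingSetSize(500)
  .setPartitionPredicate(new Predicate<PartitionDetail>() {
    @Override
    public boolean apply(PartitionDetail partition) {
      Long runtime = (Long) partition.getPartitionKey().getField("runtime");
      return runtime != null && runtime > System.currentTimeMillis() - TimeUnit.DAYS.toMillis(1);
    }
  })
  .build();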
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class DynamicPartitionerWithAvroTest method runDynamicPartitionerMapReduce.
private void runDynamicPartitionerMapReduce(final List<? extends GenericRecord> records, boolean allowConcurrentWriters, boolean expectedStatus) throws Exception {
  ApplicationWithPrograms app = deployApp(AppWithMapReduceUsingAvroDynamicPartitioner.class);
  final long now = System.currentTimeMillis();
  final Multimap<PartitionKey, GenericRecord> keyToRecordsMap = groupByPartitionKey(records, now);
  // write values to the input kvTable
  final KeyValueTable kvTable = datasetCache.getDataset(INPUT_DATASET);
  Transactions.createTransactionExecutor(txExecutorFactory, kvTable).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      // the keys are not used; it matters that they're unique though
      for (int i = 0; i < records.size(); i++) {
        kvTable.write(Integer.toString(i), records.get(i).toString());
      }
    }
  });
  String allowConcurrencyKey =
    "dataset." + OUTPUT_DATASET + "." + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_ALLOW_CONCURRENCY;
  // run the partition writer m/r with this output partition time
  ImmutableMap<String, String> arguments =
    ImmutableMap.of(OUTPUT_PARTITION_KEY, Long.toString(now), allowConcurrencyKey, Boolean.toString(allowConcurrentWriters));
  long startTime = System.currentTimeMillis();
  boolean status = runProgram(app, AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.class, new BasicArguments(arguments));
  Assert.assertEquals(expectedStatus, status);
  if (!expectedStatus) {
    // if we expect the program to fail, no need to check the output data for expected results
    return;
  }
  // Verify notifications
  List<Notification> notifications = getDataNotifications(startTime);
  Assert.assertEquals(1, notifications.size());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(OUTPUT_DATASET), DatasetId.fromString(notifications.get(0).getProperties().get("datasetId")));
  // this should have created a partition in the pfs
  final PartitionedFileSet pfs = datasetCache.getDataset(OUTPUT_DATASET);
  final Location pfsBaseLocation = pfs.getEmbeddedFileSet().getBaseLocation();
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) pfs).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws IOException {
      Map<PartitionKey, PartitionDetail> partitions = new HashMap<>();
      for (PartitionDetail partition : pfs.getPartitions(null)) {
        partitions.put(partition.getPartitionKey(), partition);
        // check that the mapreduce wrote the output partition metadata to all the output partitions
        Assert.assertEquals(AppWithMapReduceUsingAvroDynamicPartitioner.DynamicPartitioningMapReduce.METADATA, partition.getMetadata().asMap());
      }
      Assert.assertEquals(3, partitions.size());
      Assert.assertEquals(keyToRecordsMap.keySet(), partitions.keySet());
      // Check relative paths of the partitions. Also check that their location = pfs baseLocation + relativePath
      for (Map.Entry<PartitionKey, PartitionDetail> partitionKeyEntry : partitions.entrySet()) {
        PartitionDetail partitionDetail = partitionKeyEntry.getValue();
        String relativePath = partitionDetail.getRelativePath();
        int zip = (int) partitionKeyEntry.getKey().getField("zip");
        Assert.assertEquals(Long.toString(now) + Path.SEPARATOR + zip, relativePath);
        Assert.assertEquals(pfsBaseLocation.append(relativePath), partitionDetail.getLocation());
      }
      for (Map.Entry<PartitionKey, Collection<GenericRecord>> keyToRecordsEntry : keyToRecordsMap.asMap().entrySet()) {
        Set<GenericRecord> genericRecords = new HashSet<>(keyToRecordsEntry.getValue());
        Assert.assertEquals(genericRecords, readOutput(partitions.get(keyToRecordsEntry.getKey()).getLocation()));
      }
    }
  });
}
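The helper groupByPartitionKey(records, now) is not shown above. A hypothetical reconstruction, consistent with the relative-path assertion (now + Path.SEPARATOR + zip) and the "zip" field read from the partition key; the long field name "time" and the Avro record layout are assumptions, as the real AppWithMapReduceUsingAvroDynamicPartitioner may differ:

// Hypothetical sketch of the grouping helper used by the test above.
private Multimap<PartitionKey, GenericRecord> groupByPartitionKey(List<? extends GenericRecord> records, long now) {
  HashMultimap<PartitionKey, GenericRecord> groupedRecords = HashMultimap.create();
  for (GenericRecord record : records) {
    PartitionKey key = PartitionKey.builder()
      .addLongField("time", now)               // assumed long field name
      .addIntField("zip", (int) record.get("zip"))  // "zip" matches the assertion in the test
      .build();
    groupedRecords.put(key, record);
  }
  return groupedRecords;
}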
use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
the class ConnectorSource method prepareRun.
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
  Map<String, String> arguments = new HashMap<>();
  PartitionedFileSet inputFileset = context.getDataset(datasetName);
  for (PartitionDetail partitionDetail : inputFileset.getPartitions(PartitionFilter.ALWAYS_MATCH)) {
    PartitionedFileSetArguments.addInputPartition(arguments, partitionDetail);
  }
  context.setInput(Input.ofDataset(datasetName, arguments));
}
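If the source should read only a subset of partitions rather than everything matched by PartitionFilter.ALWAYS_MATCH, the same loop can be driven by a narrower PartitionFilter. A sketch, assuming the dataset is partitioned on a long "runtime" field (that field name and the literal value are illustrative):

// Sketch: restrict the connector's input to a single runtime partition value.
PartitionFilter filter = PartitionFilter.builder()
  .addValueCondition("runtime", 1234567890000L)  // assumed partition field and value
  .build();
for (PartitionDetail partitionDetail : inputFileset.getPartitions(filter)) {
  PartitionedFileSetArguments.addInputPartition(arguments, partitionDetail);
}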