Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
In class PartitionedFileSetDataset, the method consumePartitions:
// PartitionConsumerState consists of two things:
// 1) A list of transaction IDs representing the transactions that were in progress during the previous call.
// Each of these transaction IDs needs to be checked for new partitions, because those transactions may have
// created partitions since the previous call.
// 2) A transaction ID from which to start scanning for new partitions. This is the exclusive end of the range
// that the previous call stopped scanning at.
// Note that each of the transaction IDs in (1) will be smaller than the transaction ID in (2).
@ReadWrite
@Override
public PartitionConsumerResult consumePartitions(PartitionConsumerState partitionConsumerState, int limit,
                                                 Predicate<PartitionDetail> predicate) {
  List<Long> previousInProgress = partitionConsumerState.getVersionsToCheck();
  Set<Long> noLongerInProgress = setDiff(previousInProgress, tx.getInProgress());
  List<PartitionDetail> partitions = Lists.newArrayList();
  Iterator<Long> iter = noLongerInProgress.iterator();
  while (iter.hasNext()) {
    Long txId = iter.next();
    if (partitions.size() >= limit) {
      break;
    }
    try (Scanner scanner = partitionsTable.readByIndex(WRITE_PTR_COL, Bytes.toBytes(txId))) {
      scannerToPartitions(scanner, partitions, limit, predicate);
    }
    // remove the txIds as they are added to the partitions list already
    // if they're not removed, they will be persisted in the state for the next scan
    iter.remove();
  }
  // exclusive scan end, to be used as the start for a next call to consumePartitions
  long scanUpTo;
  if (partitions.size() < limit) {
    // no read your own writes (partitions)
    scanUpTo = Math.min(tx.getWritePointer(), tx.getReadPointer() + 1);
    Long endTxId;
    try (Scanner scanner = partitionsTable.scanByIndex(WRITE_PTR_COL,
                                                       Bytes.toBytes(partitionConsumerState.getStartVersion()),
                                                       Bytes.toBytes(scanUpTo))) {
      endTxId = scannerToPartitions(scanner, partitions, limit, predicate);
    }
    if (endTxId != null) {
      // nonnull means that the scanner was not exhausted
      scanUpTo = endTxId;
    }
  } else {
    // if we have already hit the limit, don't scan; instead, use the startVersion as the startVersion to the next
    // call to consumePartitions
    scanUpTo = partitionConsumerState.getStartVersion();
  }
  List<Long> inProgressBeforeScanEnd = Lists.newArrayList(noLongerInProgress);
  for (long txId : tx.getInProgress()) {
    if (txId >= scanUpTo) {
      break;
    }
    inProgressBeforeScanEnd.add(txId);
  }
  return new PartitionConsumerResult(new PartitionConsumerState(scanUpTo, inProgressBeforeScanEnd), partitions);
}
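As the comments above describe, PartitionConsumerState is the hand-off token between successive calls: a caller starts from PartitionConsumerState.FROM_BEGINNING and feeds the state returned in each PartitionConsumerResult back into the next call. The sketch below illustrates that loop from the caller's side; it is not part of the CDAP sources, and the pfs handle, the limit of 100, and the processPartition helper are assumed names (imports omitted, as in the snippets above).

PartitionConsumerState state = PartitionConsumerState.FROM_BEGINNING;
while (true) {
  PartitionConsumerResult result = pfs.consumePartitions(
    state, 100, new Predicate<PartitionDetail>() {
      @Override
      public boolean apply(PartitionDetail detail) {
        // accept every partition; a real predicate could filter on key or metadata
        return true;
      }
    });
  // carry the returned state forward so already-consumed partitions are not returned again
  state = result.getPartitionConsumerState();
  if (result.getPartitions().isEmpty()) {
    // nothing new; persist 'state' somewhere and try again later
    break;
  }
  for (PartitionDetail detail : result.getPartitions()) {
    processPartition(detail);  // placeholder for application logic
  }
}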
Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
In class PartitionedFileSetDataset, the method dropPartition:
@WriteOnly
@Override
public void dropPartition(PartitionKey key) {
  byte[] rowKey = generateRowKey(key, partitioning);
  PartitionDetail partition = getPartition(key);
  if (partition == null) {
    // silently ignore non-existing partitions
    return;
  }
  // TODO: make DDL operations transactional [CDAP-1393]
  dropPartitionFromExplore(key);
  partitionsTable.delete(rowKey);
  if (!isExternal) {
    Location partitionLocation = partition.getLocation();
    try {
      if (partitionLocation.exists()) {
        Location dstLocation = getQuarantineLocation().append(partition.getRelativePath());
        Location dstParent = Locations.getParent(dstLocation);
        // shouldn't be null, since dstLocation was created by appending to a location, so it must have a parent
        Preconditions.checkNotNull(dstParent);
        // before moving into quarantine, we need to ensure that the parent location exists
        if (!dstParent.exists()) {
          if (!dstParent.mkdirs()) {
            throw new DataSetException(String.format("Failed to create parent directory %s", dstParent));
          }
        }
        partitionLocation.renameTo(dstLocation);
      }
    } catch (IOException ioe) {
      throw new DataSetException(String.format("Failed to move location %s into quarantine", partitionLocation), ioe);
    }
    operationsInThisTx.add(new DropPartitionOperation(key, partition.getRelativePath()));
  }
}
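From application code the same operation is just a call on the PartitionedFileSet API; as the method above shows, dropping a key that has no partition is silently ignored, and for non-external file sets the partition's files are moved into a quarantine location rather than deleted outright. A minimal sketch, assuming a pfs handle obtained inside a transaction and an example key field named "league":

PartitionKey key = PartitionKey.builder().addStringField("league", "fantasy").build();
// removes the partition's metadata; no-op if the partition does not exist
pfs.dropPartition(key);
// subsequent lookups for the same key return null
Assert.assertNull(pfs.getPartition(key));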
Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
In class SportResultsTest, the method testPartitionedCounting:
@Test
public void testPartitionedCounting() throws Exception {
  // deploy the application and start the upload service
  ApplicationManager appManager = deployApplication(SportResults.class);
  ServiceManager serviceManager = appManager.getServiceManager("UploadService").start();
  serviceManager.waitForStatus(true);
  // upload a few dummy results
  URL url = serviceManager.getServiceURL();
  uploadResults(url, "fantasy", 2014, FANTASY_2014);
  uploadResults(url, "fantasy", 2015, FANTASY_2015);
  uploadResults(url, "critters", 2014, CRITTERS_2014);
  // start a map/reduce that counts all seasons for the fantasy league
  MapReduceManager mrManager = appManager.getMapReduceManager("ScoreCounter")
    .start(ImmutableMap.of("league", "fantasy"));
  // allow up to 5 minutes, although the run should complete much faster
  mrManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  // validate the output by reading directly from the file set
  DataSetManager<PartitionedFileSet> dataSetManager = getDataset("totals");
  PartitionedFileSet totals = dataSetManager.get();
  PartitionDetail partitionDetail =
    totals.getPartition(PartitionKey.builder().addStringField("league", "fantasy").build());
  Assert.assertNotNull(partitionDetail);
  Location location = partitionDetail.getLocation();
  // find the part file that has the actual results
  Assert.assertTrue(location.isDirectory());
  for (Location file : location.list()) {
    if (file.getName().startsWith("part")) {
      location = file;
    }
  }
  BufferedReader reader = new BufferedReader(new InputStreamReader(location.getInputStream(), "UTF-8"));
  // validate each line
  Map<String, String[]> expected = ImmutableMap.of(
    "My Team", new String[] { "My Team", "2", "0", "1", "53", "65" },
    "Your Team", new String[] { "Your Team", "1", "0", "2", "63", "60" },
    "Other Team", new String[] { "Other Team", "1", "0", "1", "40", "31" });
  while (true) {
    String line = reader.readLine();
    if (line == null) {
      break;
    }
    String[] fields = line.split(",");
    Assert.assertArrayEquals(expected.get(fields[0]), fields);
  }
  // verify the same totals using SQL
  Connection connection = getQueryClient();
  ResultSet results = connection.prepareStatement(
    "SELECT wins, ties, losses, scored, conceded " +
    "FROM totals WHERE team = 'My Team' AND league = 'fantasy'").executeQuery();
  // should return only one row, with the correct totals
  Assert.assertTrue(results.next());
  Assert.assertEquals(2, results.getInt(1));
  Assert.assertEquals(0, results.getInt(2));
  Assert.assertEquals(1, results.getInt(3));
  Assert.assertEquals(53, results.getInt(4));
  Assert.assertEquals(65, results.getInt(5));
  Assert.assertFalse(results.next());
}
Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
In class SparkFileSetTestRun, the method testSparkWithPartitionedFileSet:
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager,
                                             String sparkProgram) throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  final PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());
  // Cleanup after test completed
  location.delete(true);
  partition.getLocation().delete(true);
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}
Use of co.cask.cdap.api.dataset.lib.PartitionDetail in project cdap by caskdata.
In class FileUploadServiceTestRun, the method testFileUploadService:
@Test
public void testFileUploadService() throws Exception {
  ApplicationManager appManager = deployApplication(FileUploadApp.class);
  // Start the service
  ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
  try {
    // The upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
    URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
    // Upload with a wrong MD5; should get 400
    byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
    Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST,
                        upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(),
                               content, "123", 30));
    long beforeUploadTime = System.currentTimeMillis();
    // Upload with the right MD5; should get 200
    Assert.assertEquals(HttpURLConnection.HTTP_OK,
                        upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(),
                               content, BaseEncoding.base64().encode(Hashing.md5().hashBytes(content).asBytes()),
                               20));
    // Inspect the partitioned file set and verify the content
    PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
    PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
    Assert.assertNotNull(partition);
    // Verify that a notification was published for the new partition
    List<Notification> notifications = getDataNotifications(beforeUploadTime);
    // Should have one message
    Assert.assertEquals(1, notifications.size());
    verifyDataNotification(notifications.get(0),
                           NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME),
                           Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
    // There should be one file under the partition directory
    List<Location> locations = partition.getLocation().list();
    Assert.assertEquals(1, locations.size());
    Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
    // Verify the tracking table of chunk sizes
    KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
    CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
    // Sum up all chunk sizes as tracked by the tracking table.
    long sum = 0;
    int iterSize = 0;
    while (iter.hasNext()) {
      KeyValue<byte[], byte[]> kv = iter.next();
      sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
      iterSize++;
    }
    // The iterator should have size >= 2, since a different chunk size is used for each of the two uploads
    Assert.assertTrue(iterSize >= 2);
    // The sum of all chunk sizes should equal content.length * 2
    // (one failed and one successful upload)
    Assert.assertEquals(content.length * 2, sum);
  } finally {
    serviceManager.stop();
  }
}
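For contrast with the HTTP upload path exercised in this test, the sketch below writes a partition directly through the PartitionedFileSet API, without going through the service. It is not taken from FileUploadApp; the pfs handle, the time value 2, and the file name "part-0" are assumptions, and imports are omitted as in the snippets above.

PartitionOutput output = pfs.getPartitionOutput(PartitionKey.builder().addLongField("time", 2L).build());
Location dir = output.getLocation();
// write one file into the partition's directory before registering it
try (OutputStream os = dir.append("part-0").getOutputStream()) {
  os.write("some content".getBytes(StandardCharsets.UTF_8));
}
// register the written files as a new partition of the file set
output.addPartition();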