use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class ExploreTableManager method generateEnableStatement.
/**
 * Generate a Hive DDL statement to create a Hive table for the given dataset.
 *
 * @param dataset the instantiated dataset
 * @param spec the dataset specification
 * @param datasetId the dataset id
 * @param tableName the name of the Hive table to create
 * @param truncating whether this call to create() is part of a truncate() operation, which is in some
 *                   cases implemented using disableExplore() followed by enableExplore()
 *
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws UnsupportedTypeException if the dataset is a RecordScannable of a type that is not supported by Hive
 */
@Nullable
private String generateEnableStatement(Dataset dataset, DatasetSpecification spec, DatasetId datasetId,
                                        String tableName, boolean truncating)
  throws UnsupportedTypeException, ExploreException {

  String datasetName = datasetId.getDataset();
  Map<String, String> serdeProperties = ImmutableMap.of(
    Constants.Explore.DATASET_NAME, datasetId.getDataset(),
    Constants.Explore.DATASET_NAMESPACE, datasetId.getNamespace());

  // To be explorable, the dataset must be a Table, an ObjectMappedTable, a RecordScannable or RecordWritable,
  // or it must be a FileSet or a PartitionedFileSet with explore enabled in its properties.
  if (dataset instanceof Table) {
    // valid for a table not to have a schema property; this logic should really be in Table
    return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, false);
  }
  if (dataset instanceof ObjectMappedTable) {
    return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
  }

  boolean isRecordScannable = dataset instanceof RecordScannable;
  boolean isRecordWritable = dataset instanceof RecordWritable;
  if (isRecordScannable || isRecordWritable) {
    Type recordType = isRecordScannable
      ? ((RecordScannable) dataset).getRecordType()
      : ((RecordWritable) dataset).getRecordType();

    // if the record type is StructuredRecord, use the schema property to create the table
    // (use == because we want exactly the StructuredRecord class, not a subtype)
    if (StructuredRecord.class == recordType) {
      return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
    }

    // otherwise, derive the schema from the record type
    LOG.debug("Enabling explore for dataset instance {}", datasetName);
    String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
    return new CreateStatementBuilder(datasetName, databaseName, tableName, shouldEscapeColumns)
      .setSchema(hiveSchemaFor(recordType))
      .setTableComment("CDAP Dataset")
      .buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
  } else if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
    Map<String, String> properties = spec.getProperties();
    if (FileSetProperties.isExploreEnabled(properties)) {
      LOG.debug("Enabling explore for dataset instance {}", datasetName);
      return generateFileSetCreateStatement(datasetId, dataset, properties, truncating);
    }
  }

  // dataset is not explorable
  return null;
}
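For context: the FileSet / PartitionedFileSet branch above only produces a CREATE statement when explore is enabled in the dataset's properties. A minimal sketch of how such a dataset might be declared in an application, assuming the standard FileSetProperties/PartitionedFileSetProperties builder methods (setEnableExploreOnCreate, setExploreFormat, setExploreSchema); the application class and dataset name are illustrative only:

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;

public class ExplorableFileSetApp extends AbstractApplication {
  @Override
  public void configure() {
    // Explore is opted into via the dataset properties; without setEnableExploreOnCreate(true),
    // FileSetProperties.isExploreEnabled() is false and generateEnableStatement() returns null.
    createDataset("records", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addStringField("s").addIntField("i").build())
      .setEnableExploreOnCreate(true)
      .setExploreFormat("csv")
      .setExploreSchema("f1 string, f2 int")
      .build());
  }
}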
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class ExploreTableManager method generateDisableStatement.
private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
  String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
  String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());

  // if the table does not exist, there is nothing to be done
  try {
    exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
  } catch (TableNotFoundException e) {
    // ignore the exception: it simply means the table was not found
    return null;
  }

  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    Dataset dataset = datasetInstantiator.getDataset(datasetId);
    try {
      if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
        // do not drop the explore table if the dataset is reusing an existing table
        if (FileSetProperties.isUseExisting(spec.getProperties())) {
          return null;
        }
      }
      return generateDeleteStatement(dataset, databaseName, tableName);
    } finally {
      Closeables.closeQuietly(dataset);
    }
  } catch (IOException e) {
    LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
    throw new ExploreException("Exception instantiating dataset " + datasetId);
  }
}
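The DROP statement itself is built by generateDeleteStatement, which is not shown in this snippet. As a rough, illustrative sketch of the kind of DDL it produces (the exact quoting, escaping, and storage-handler handling in the real method may differ):

// Illustrative only: roughly the shape of the statement generateDeleteStatement returns.
String qualifiedName = (databaseName == null) ? tableName : databaseName + "." + tableName;
String dropStatement = "DROP TABLE IF EXISTS " + qualifiedName;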
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConsumerTest method testPartitionConsumingWithPartitionAcceptor.
@Test
public void testPartitionConsumingWithPartitionAcceptor() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final TransactionAware txAwareDataset = (TransactionAware) dataset;

  // i ranges over [0,10), s is always 'partitionKeys1'
  final Set<PartitionKey> partitionKeys1 = new HashSet<>();
  for (int i = 0; i < 10; i++) {
    PartitionKey key = PartitionKey.builder()
      .addIntField("i", i)
      .addLongField("l", 17L)
      .addStringField("s", "partitionKeys1")
      .build();
    partitionKeys1.add(key);
  }

  // i ranges over [0,15), s is always 'partitionKeys2'
  final Set<PartitionKey> partitionKeys2 = new HashSet<>();
  for (int i = 0; i < 15; i++) {
    PartitionKey key = PartitionKey.builder()
      .addIntField("i", i)
      .addLongField("l", 17L)
      .addStringField("s", "partitionKeys2")
      .build();
    partitionKeys2.add(key);
  }

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      for (final PartitionKey partitionKey : partitionKeys1) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
      for (final PartitionKey partitionKey : partitionKeys2) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    }
  });

  final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      List<Partition> consumedPartitions = new ArrayList<>();

      // specify a PartitionAcceptor that limits consumption to partitions whose 's' field equals 'partitionKeys1',
      // so it gets all the partitions in partitionKeys1
      Iterables.addAll(consumedPartitions,
                       partitionConsumer.consumePartitions(new CustomAcceptor("partitionKeys1")).getPartitions());
      // assert that we consumed all the partitions represented by partitionKeys1
      Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
      consumedPartitions.clear();

      // ask for partitions whose 's' field equals 'partitionKeys2', but stop iterating once the 'i' field == 8
      Iterables.addAll(consumedPartitions,
                       partitionConsumer.consumePartitions(new CustomAcceptor("partitionKeys2", 8)).getPartitions());
      // this gives us 8 of the partitionKeys2 partitions
      Assert.assertEquals(8, consumedPartitions.size());

      // ask for the remainder of the partitions (i ranging over [8,15)); we then have all of partitionKeys2
      Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions().getPartitions());
      Assert.assertEquals(partitionKeys2, toKeys(consumedPartitions));
    }
  });
}
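CustomAcceptor is a helper defined elsewhere in PartitionConsumerTest and is not part of this snippet. A plausible sketch of what it might look like, assuming CDAP's co.cask.cdap.api.dataset.lib.partitioned.PartitionAcceptor interface with its ACCEPT/SKIP/STOP return values (the real test's field handling may differ):

// Hypothetical reconstruction of the CustomAcceptor used above, for illustration only.
private static final class CustomAcceptor implements PartitionAcceptor {
  private final String sValue;        // only partitions whose 's' field matches are accepted
  private final Integer stopAtIValue; // optional: stop iterating once the 'i' field reaches this value

  CustomAcceptor(String sValue) {
    this(sValue, null);
  }

  CustomAcceptor(String sValue, Integer stopAtIValue) {
    this.sValue = sValue;
    this.stopAtIValue = stopAtIValue;
  }

  @Override
  public Return accept(PartitionDetail partitionDetail) {
    PartitionKey key = partitionDetail.getPartitionKey();
    if (stopAtIValue != null && (Integer) key.getField("i") >= stopAtIValue) {
      return Return.STOP;   // end this round of consumption
    }
    if (!sValue.equals(key.getField("s"))) {
      return Return.SKIP;   // leave the partition available for later rounds
    }
    return Return.ACCEPT;
  }
}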
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConsumerTest method testSimpleConcurrency.
@Test
public void testSimpleConcurrency() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  final TransactionAware txAwareDataset = (TransactionAware) dataset;

  final Set<PartitionKey> partitionKeys = new HashSet<>();
  for (int i = 0; i < 10; i++) {
    partitionKeys.add(generateUniqueKey());
  }

  // create ConcurrentPartitionConsumers that share the same state
  InMemoryStatePersistor persistor = new InMemoryStatePersistor();
  ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(3).build();
  final PartitionConsumer partitionConsumer1 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
  final PartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
  final PartitionConsumer partitionConsumer3 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);

  // add all ten keys to the partitioned fileset
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      for (final PartitionKey partitionKey : partitionKeys) {
        dataset.getPartitionOutput(partitionKey).addPartition();
      }
    }
  });

  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // with limit = 1, the returned list has only one element, even though there are more unconsumed partitions
      List<PartitionDetail> consumedBy1 = partitionConsumer1.consumePartitions(1).getPartitions();
      Assert.assertEquals(1, consumedBy1.size());

      // partitionConsumer2 asks for 10 partitions, but 1 is currently in progress with partitionConsumer1,
      // so it only gets the remaining 9 partitions
      List<PartitionDetail> consumedBy2 = partitionConsumer2.consumePartitions(10).getPartitions();
      Assert.assertEquals(9, consumedBy2.size());

      // partitionConsumer3 tries to consume partitions, but all are marked in-progress by partitionConsumers 1 and 2
      Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());

      // partitionConsumer1 aborts its partition, so it becomes available to partitionConsumer3
      partitionConsumer1.onFinish(consumedBy1, false);
      consumedBy1.clear();

      // partitionConsumer3 queries with limit = 2, but only the single partition released by partitionConsumer1
      // is available
      List<PartitionDetail> consumedBy3 = partitionConsumer3.consumePartitions(2).getPartitions();
      Assert.assertEquals(1, consumedBy3.size());

      // partitionConsumers 2 and 3 mark that they successfully processed their partitions
      partitionConsumer3.onFinish(consumedBy3, true);

      // test the onFinishWithKeys API
      List<PartitionKey> keysConsumedBy2 = Lists.transform(consumedBy2, new Function<PartitionDetail, PartitionKey>() {
        @Override
        public PartitionKey apply(PartitionDetail input) {
          return input.getPartitionKey();
        }
      });
      partitionConsumer2.onFinishWithKeys(keysConsumedBy2, true);

      // at this point all partitions are processed, so no additional partitions are available for consumption
      Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());

      List<PartitionDetail> allProcessedPartitions = new ArrayList<>();
      allProcessedPartitions.addAll(consumedBy1);
      allProcessedPartitions.addAll(consumedBy2);
      allProcessedPartitions.addAll(consumedBy3);

      // ordering may differ, since all the partitions were added in the same transaction
      Assert.assertEquals(partitionKeys, toKeys(allProcessedPartitions));
    }
  });
}
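Both tests above compare consumed partitions against the expected PartitionKey sets via a toKeys helper defined elsewhere in PartitionConsumerTest. A minimal sketch of what such a helper might look like, matching the two call sites above (the real helper's name and generics may differ slightly):

// Hypothetical helper for illustration: extracts the keys from a list of consumed partitions.
private static Set<PartitionKey> toKeys(List<? extends Partition> partitions) {
  Set<PartitionKey> keys = new HashSet<>();
  for (Partition partition : partitions) {
    keys.add(partition.getPartitionKey());
  }
  return keys;
}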
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConsumerTest method testPartitionConsumer.
@Test
public void testPartitionConsumer() throws Exception {
  // Exercises an edge case of partition consumption: partitions are being consumed while another in-progress
  // transaction has added a partition but has not yet committed, so that partition is not yet visible to the
  // consumer.
  PartitionedFileSet dataset1 = dsFrameworkUtil.getInstance(pfsInstance);
  PartitionedFileSet dataset2 = dsFrameworkUtil.getInstance(pfsInstance);
  TransactionManager txManager = dsFrameworkUtil.getTxManager();
  InMemoryTxSystemClient txClient = new InMemoryTxSystemClient(txManager);

  // the producer simply adds an initial partition
  TransactionContext txContext1 = new TransactionContext(txClient, (TransactionAware) dataset1);
  txContext1.start();
  PartitionKey partitionKey1 = generateUniqueKey();
  dataset1.getPartitionOutput(partitionKey1).addPartition();
  txContext1.finish();

  // the consumer simply consumes the initial partition
  TransactionContext txContext2 = new TransactionContext(txClient, (TransactionAware) dataset2);
  txContext2.start();
  PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset2, new InMemoryStatePersistor());
  List<? extends PartitionDetail> partitionIterator = partitionConsumer.consumePartitions().getPartitions();
  Assert.assertEquals(1, partitionIterator.size());
  Assert.assertEquals(partitionKey1, partitionIterator.get(0).getPartitionKey());
  txContext2.finish();

  // the producer adds a second partition, but does not yet commit the transaction
  txContext1.start();
  PartitionKey partitionKey2 = generateUniqueKey();
  dataset1.getPartitionOutput(partitionKey2).addPartition();

  // the consumer attempts to consume after the partition was added but before it was committed; because of this,
  // the partition is not visible and will not be consumed
  txContext2.start();
  Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
  txContext2.finish();

  // the producer commits the transaction in which the second partition was added
  txContext1.finish();

  // the next time the consumer runs, it processes the second partition
  txContext2.start();
  partitionIterator = partitionConsumer.consumePartitions().getPartitions();
  Assert.assertEquals(1, partitionIterator.size());
  Assert.assertEquals(partitionKey2, partitionIterator.get(0).getPartitionKey());
  txContext2.finish();
}
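generateUniqueKey() is another PartitionConsumerTest helper not shown in these snippets. A plausible sketch, assuming the same three-field partitioning ('i' int, 'l' long, 's' string) used in the earlier test and a static counter to keep keys distinct (imports java.util.concurrent.atomic.AtomicInteger):

// Hypothetical helper for illustration: produces a distinct PartitionKey on each call,
// matching the int/long/string partitioning of the test's PartitionedFileSet instance.
private static final AtomicInteger COUNTER = new AtomicInteger();

private static PartitionKey generateUniqueKey() {
  int i = COUNTER.incrementAndGet();
  return PartitionKey.builder()
    .addIntField("i", i)
    .addLongField("l", (long) i)
    .addStringField("s", Integer.toString(i))
    .build();
}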