
Example 41 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From class ExploreTableManager, method generateEnableStatement:

/**
 * Generate a Hive DDL statement to create a Hive table for the given dataset.
 *
 * @param dataset the instantiated dataset
 * @param spec the dataset specification
 * @param datasetId the dataset id
 * @param tableName the name of the Hive table to create
 * @param truncating whether this call to create() is part of a truncate() operation, which is in some
 *                   cases implemented using disableExplore() followed by enableExplore()
 *
 * @return a CREATE TABLE statement, or null if the dataset is not explorable
 * @throws UnsupportedTypeException if the dataset is a RecordScannable of a type that is not supported by Hive
 */
@Nullable
private String generateEnableStatement(Dataset dataset, DatasetSpecification spec, DatasetId datasetId, String tableName, boolean truncating) throws UnsupportedTypeException, ExploreException {
    String datasetName = datasetId.getDataset();
    Map<String, String> serdeProperties = ImmutableMap.of(Constants.Explore.DATASET_NAME, datasetId.getDataset(), Constants.Explore.DATASET_NAMESPACE, datasetId.getNamespace());
    // The dataset must be a Table, an ObjectMappedTable, a RecordScannable/RecordWritable,
    // or a FileSet/PartitionedFileSet with explore enabled in its properties.
    if (dataset instanceof Table) {
        // It is valid for a table not to have a schema property; that logic should really be in Table
        return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, false);
    }
    if (dataset instanceof ObjectMappedTable) {
        return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
    }
    boolean isRecordScannable = dataset instanceof RecordScannable;
    boolean isRecordWritable = dataset instanceof RecordWritable;
    if (isRecordScannable || isRecordWritable) {
        Type recordType = isRecordScannable ? ((RecordScannable) dataset).getRecordType() : ((RecordWritable) dataset).getRecordType();
        // Use == because identity of the Class objects is exactly what "same class" means here.
        if (StructuredRecord.class == recordType) {
            return generateCreateStatementFromSchemaProperty(spec, datasetId, tableName, serdeProperties, true);
        }
        // otherwise, derive the schema from the record type
        LOG.debug("Enabling explore for dataset instance {}", datasetName);
        String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
        return new CreateStatementBuilder(datasetName, databaseName, tableName, shouldEscapeColumns).setSchema(hiveSchemaFor(recordType)).setTableComment("CDAP Dataset").buildWithStorageHandler(DatasetStorageHandler.class.getName(), serdeProperties);
    } else if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
        Map<String, String> properties = spec.getProperties();
        if (FileSetProperties.isExploreEnabled(properties)) {
            LOG.debug("Enabling explore for dataset instance {}", datasetName);
            return generateFileSetCreateStatement(datasetId, dataset, properties, truncating);
        }
    }
    // dataset is not explorable
    return null;
}
Also used : Table(co.cask.cdap.api.dataset.table.Table) ObjectMappedTable(co.cask.cdap.api.dataset.lib.ObjectMappedTable) RecordWritable(co.cask.cdap.api.data.batch.RecordWritable) FileSet(co.cask.cdap.api.dataset.lib.FileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) CreateStatementBuilder(co.cask.cdap.explore.table.CreateStatementBuilder) RecordScannable(co.cask.cdap.api.data.batch.RecordScannable) Type(java.lang.reflect.Type) DatasetStorageHandler(co.cask.cdap.hive.datasets.DatasetStorageHandler) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Nullable(javax.annotation.Nullable)
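
For the file set branch above, a CREATE TABLE statement is only generated when FileSetProperties.isExploreEnabled(properties) returns true. As a point of reference, here is a minimal sketch, using the public CDAP application API, of how a PartitionedFileSet might be declared so that it takes this branch; the dataset name "events", the partitioning fields, and the schema are illustrative, not taken from the code above.

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetProperties;
import co.cask.cdap.api.dataset.lib.Partitioning;

public class ExploreEnabledApp extends AbstractApplication {

    @Override
    public void configure() {
        // setEnableExploreOnCreate(true) is what makes FileSetProperties.isExploreEnabled(...)
        // return true, so generateEnableStatement will emit a CREATE TABLE statement
        createDataset("events", PartitionedFileSet.class, PartitionedFileSetProperties.builder()
            .setPartitioning(Partitioning.builder().addStringField("s").addIntField("i").build())
            .setEnableExploreOnCreate(true)
            .setExploreFormat("text")
            .setExploreSchema("body STRING")
            .build());
    }
}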

Example 42 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From class ExploreTableManager, method generateDisableStatement:

private String generateDisableStatement(DatasetId datasetId, DatasetSpecification spec) throws ExploreException {
    String tableName = tableNaming.getTableName(datasetId, spec.getProperties());
    String databaseName = ExploreProperties.getExploreDatabaseName(spec.getProperties());
    // If table does not exist, nothing to be done
    try {
        exploreService.getTableInfo(datasetId.getNamespace(), databaseName, tableName);
    } catch (TableNotFoundException e) {
        // Ignore exception, since this means table was not found.
        return null;
    }
    try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
        Dataset dataset = datasetInstantiator.getDataset(datasetId);
        try {
            if (dataset instanceof FileSet || dataset instanceof PartitionedFileSet) {
                // do not drop the explore table if the dataset is reusing an existing table
                if (FileSetProperties.isUseExisting(spec.getProperties())) {
                    return null;
                }
            }
            return generateDeleteStatement(dataset, databaseName, tableName);
        } finally {
            Closeables.closeQuietly(dataset);
        }
    } catch (IOException e) {
        LOG.error("Exception creating dataset classLoaderProvider for dataset {}.", datasetId, e);
        throw new ExploreException("Exception instantiating dataset " + datasetId);
    }
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) SystemDatasetInstantiator(co.cask.cdap.data.dataset.SystemDatasetInstantiator) Dataset(co.cask.cdap.api.dataset.Dataset) IOException(java.io.IOException)
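
When the isUseExisting guard does not apply, generateDeleteStatement (not shown in this excerpt) produces the Hive statement that drops the dataset's explore table. A hedged sketch of what that statement plausibly reduces to; the helper below is hypothetical and only illustrates the expected shape, not the actual ExploreTableManager implementation:

// hypothetical helper, not the actual ExploreTableManager code
private String sketchDeleteStatement(@Nullable String databaseName, String tableName) {
    // drop the table in the configured explore database if one was set, else in the default database
    return databaseName == null
        ? String.format("DROP TABLE IF EXISTS %s", tableName)
        : String.format("DROP TABLE IF EXISTS %s.%s", databaseName, tableName);
}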

Example 43 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From class PartitionConsumerTest, method testPartitionConsumingWithPartitionAcceptor:

@Test
public void testPartitionConsumingWithPartitionAcceptor() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    // i will range from [0,10), s will always be 'partitionKeys1'
    final Set<PartitionKey> partitionKeys1 = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        PartitionKey key = PartitionKey.builder().addIntField("i", i).addLongField("l", 17L).addStringField("s", "partitionKeys1").build();
        partitionKeys1.add(key);
    }
    // i will range from [0,15), s will always be 'partitionKeys2'
    final Set<PartitionKey> partitionKeys2 = new HashSet<>();
    for (int i = 0; i < 15; i++) {
        PartitionKey key = PartitionKey.builder().addIntField("i", i).addLongField("l", 17L).addStringField("s", "partitionKeys2").build();
        partitionKeys2.add(key);
    }
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (final PartitionKey partitionKey : partitionKeys1) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
            for (final PartitionKey partitionKey : partitionKeys2) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            List<Partition> consumedPartitions = new ArrayList<>();
            // specify a PartitionAcceptor that limits consumption to partitions whose 's' field equals 'partitionKeys1',
            // so it will get all the partitions in partitionKeys1
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(new CustomAcceptor("partitionKeys1")).getPartitions());
            // assert that we consumed all the partitions represented by partitionKeys1
            Assert.assertEquals(partitionKeys1, toKeys(consumedPartitions));
            consumedPartitions.clear();
            // ask for partitions where 's' field is equal to 'partitionKeys2', but stop iterating upon 'i' field == 8
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions(new CustomAcceptor("partitionKeys2", 8)).getPartitions());
            // this will give us 8 of partitionKeys2
            Assert.assertEquals(8, consumedPartitions.size());
            // ask for the remainder of the partitions - i ranging from [8,15). Then, we will have all of 'partitionKeys2'
            Iterables.addAll(consumedPartitions, partitionConsumer.consumePartitions().getPartitions());
            Assert.assertEquals(partitionKeys2, toKeys(consumedPartitions));
        }
    });
}
Also used : ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) TransactionAware(org.apache.tephra.TransactionAware) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)
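
The test relies on a CustomAcceptor helper that is not part of this excerpt. A plausible sketch, assuming the PartitionAcceptor contract in which accept(...) returns ACCEPT, SKIP, or STOP for each partition offered to the consumer, and that partitions are offered in the order they were added:

import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.partitioned.PartitionAcceptor;

public class CustomAcceptor implements PartitionAcceptor {

    private final String sValue;
    private final Integer stopOnI; // null means never stop early

    public CustomAcceptor(String sValue) {
        this(sValue, null);
    }

    public CustomAcceptor(String sValue, Integer stopOnI) {
        this.sValue = sValue;
        this.stopOnI = stopOnI;
    }

    @Override
    public Return accept(PartitionDetail partitionDetail) {
        PartitionKey key = partitionDetail.getPartitionKey();
        if (stopOnI != null && stopOnI.equals(key.getField("i"))) {
            return Return.STOP; // stop iterating over any further partitions
        }
        if (!sValue.equals(key.getField("s"))) {
            return Return.SKIP; // ignore partitions whose 's' field does not match
        }
        return Return.ACCEPT;
    }
}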

Example 44 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From class PartitionConsumerTest, method testSimpleConcurrency:

@Test
public void testSimpleConcurrency() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    final Set<PartitionKey> partitionKeys = new HashSet<>();
    for (int i = 0; i < 10; i++) {
        partitionKeys.add(generateUniqueKey());
    }
    // have ConcurrentPartitionConsumers that share the same state.
    InMemoryStatePersistor persistor = new InMemoryStatePersistor();
    ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(3).build();
    final PartitionConsumer partitionConsumer1 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    final PartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    final PartitionConsumer partitionConsumer3 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
    // add all ten keys to the partitioned fileset
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            for (final PartitionKey partitionKey : partitionKeys) {
                dataset.getPartitionOutput(partitionKey).addPartition();
            }
        }
    });
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            // with limit = 1, only a single partition is returned, even though there are more unconsumed partitions
            List<PartitionDetail> consumedBy1 = partitionConsumer1.consumePartitions(1).getPartitions();
            Assert.assertEquals(1, consumedBy1.size());
            // partitionConsumer2 asks for 10 partitions, but 1 is currently in progress by partitionConsumer1, so it only
            // gets the remaining 9 partitions
            List<PartitionDetail> consumedBy2 = partitionConsumer2.consumePartitions(10).getPartitions();
            Assert.assertEquals(9, consumedBy2.size());
            // partitionConsumer3 tries to consume partitions, but all are marked in-progress by partitionConsumer 1 and 2
            Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());
            // partitionConsumer1 aborts its partition, so it then becomes available for partitionConsumer3
            partitionConsumer1.onFinish(consumedBy1, false);
            consumedBy1.clear();
            // query with limit = 2, but only the one partition that partitionConsumer1 released is available
            List<PartitionDetail> consumedBy3 = partitionConsumer3.consumePartitions(2).getPartitions();
            Assert.assertEquals(1, consumedBy3.size());
            // partitionConsumers 2 and 3 mark that they successfully processed their partitions
            partitionConsumer3.onFinish(consumedBy3, true);
            // test onFinishWithKeys API
            List<PartitionKey> keysConsumedBy2 = Lists.transform(consumedBy2, new Function<PartitionDetail, PartitionKey>() {

                @Override
                public PartitionKey apply(PartitionDetail input) {
                    return input.getPartitionKey();
                }
            });
            partitionConsumer2.onFinishWithKeys(keysConsumedBy2, true);
            // at this point, all partitions are processed, so no additional partitions are available for consumption
            Assert.assertEquals(0, partitionConsumer3.consumePartitions().getPartitions().size());
            List<PartitionDetail> allProcessedPartitions = new ArrayList<>();
            allProcessedPartitions.addAll(consumedBy1);
            allProcessedPartitions.addAll(consumedBy2);
            allProcessedPartitions.addAll(consumedBy3);
            // ordering may be different, since all the partitions were added in the same transaction
            Assert.assertEquals(partitionKeys, toKeys(allProcessedPartitions));
        }
    });
}
Also used : ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) Function(com.google.common.base.Function) TransactionAware(org.apache.tephra.TransactionAware) ConsumerConfiguration(co.cask.cdap.api.dataset.lib.partitioned.ConsumerConfiguration) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) HashSet(java.util.HashSet) Test(org.junit.Test)
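
Both tests also depend on an InMemoryStatePersistor, which is not shown. A minimal sketch, assuming the StatePersistor contract is nothing more than reading back and persisting an opaque byte[] of consumer state. Because the state lives only in the JVM it is suitable for tests; production consumers would persist state transactionally:

import javax.annotation.Nullable;
import co.cask.cdap.api.dataset.lib.partitioned.StatePersistor;

public class InMemoryStatePersistor implements StatePersistor {

    private byte[] state;

    @Nullable
    @Override
    public byte[] readState() {
        return state; // null until a consumer persists its first state
    }

    @Override
    public void persistState(byte[] state) {
        this.state = state;
    }
}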

Example 45 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From class PartitionConsumerTest, method testPartitionConsumer:

@Test
public void testPartitionConsumer() throws Exception {
    // Exercises an edge case of partition consumption: partitions are consumed while another in-progress
    // transaction has added a partition but not yet committed, so that partition is not yet visible to the
    // consumer
    PartitionedFileSet dataset1 = dsFrameworkUtil.getInstance(pfsInstance);
    PartitionedFileSet dataset2 = dsFrameworkUtil.getInstance(pfsInstance);
    TransactionManager txManager = dsFrameworkUtil.getTxManager();
    InMemoryTxSystemClient txClient = new InMemoryTxSystemClient(txManager);
    // producer simply adds initial partition
    TransactionContext txContext1 = new TransactionContext(txClient, (TransactionAware) dataset1);
    txContext1.start();
    PartitionKey partitionKey1 = generateUniqueKey();
    dataset1.getPartitionOutput(partitionKey1).addPartition();
    txContext1.finish();
    // consumer simply consumes initial partition
    TransactionContext txContext2 = new TransactionContext(txClient, (TransactionAware) dataset2);
    txContext2.start();
    PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset2, new InMemoryStatePersistor());
    List<? extends PartitionDetail> partitionIterator = partitionConsumer.consumePartitions().getPartitions();
    Assert.assertEquals(1, partitionIterator.size());
    Assert.assertEquals(partitionKey1, partitionIterator.get(0).getPartitionKey());
    txContext2.finish();
    // producer adds a second partition, but does not yet commit the transaction
    txContext1.start();
    PartitionKey partitionKey2 = generateUniqueKey();
    dataset1.getPartitionOutput(partitionKey2).addPartition();
    // consumer attempts to consume at a time after the partition was added, but before it committed. Because of this,
    // the partition is not visible and will not be consumed
    txContext2.start();
    Assert.assertTrue(partitionConsumer.consumePartitions().getPartitions().isEmpty());
    txContext2.finish();
    // producer commits the transaction in which the second partition was added
    txContext1.finish();
    // the next time the consumer runs, it processes the second partition
    txContext2.start();
    partitionIterator = partitionConsumer.consumePartitions().getPartitions();
    Assert.assertEquals(1, partitionIterator.size());
    Assert.assertEquals(partitionKey2, partitionIterator.get(0).getPartitionKey());
    txContext2.finish();
}
Also used : ConcurrentPartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) TransactionManager(org.apache.tephra.TransactionManager) TransactionContext(org.apache.tephra.TransactionContext) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionConsumer(co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) InMemoryTxSystemClient(org.apache.tephra.inmemory.InMemoryTxSystemClient) Test(org.junit.Test)
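
Finally, the generateUniqueKey helper used throughout these tests is not shown. A plausible sketch, assuming the same (int 'i', long 'l', string 's') partitioning that the other tests in this class build their keys over; the counter scheme is illustrative:

import java.util.concurrent.atomic.AtomicInteger;
import co.cask.cdap.api.dataset.lib.PartitionKey;

public class KeyGenerator {

    private static final AtomicInteger COUNTER = new AtomicInteger();

    // hypothetical helper: produces a distinct key on every call
    public static PartitionKey generateUniqueKey() {
        int i = COUNTER.incrementAndGet();
        return PartitionKey.builder()
            .addIntField("i", i)
            .addLongField("l", (long) i)
            .addStringField("s", "s" + i)
            .build();
    }
}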

Aggregations

PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 65
Test (org.junit.Test): 39
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 32
Location (org.apache.twill.filesystem.Location): 25
TransactionAware (org.apache.tephra.TransactionAware): 24
TransactionExecutor (org.apache.tephra.TransactionExecutor): 24
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 18
IOException (java.io.IOException): 17
DataSetException (co.cask.cdap.api.dataset.DataSetException): 12
FileSet (co.cask.cdap.api.dataset.lib.FileSet): 12
HashSet (java.util.HashSet): 12
List (java.util.List): 12
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 11
PartitionAlreadyExistsException (co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException): 11
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer): 11
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer): 11
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 9
ImmutableList (com.google.common.collect.ImmutableList): 9
ArrayList (java.util.ArrayList): 9
HashMap (java.util.HashMap): 9