Search in sources :

Example 16 with Dataset

use of io.cdap.cdap.api.dataset.Dataset in project cdap by cdapio.

the class DatasetModulesDeployer method loadAndDeployModule.

/**
 * Loads {@code className} through the given class loader and adds it as the dataset module
 * {@code moduleName} of the given namespace.
 *
 * <p>A {@link DatasetModule} implementation is instantiated and added directly. A {@link Dataset}
 * implementation is wrapped in a {@link SingleTypeModule}; in that case nothing is added when the type is a
 * system type, or when the type already exists and unchecked upgrades are not allowed.
 *
 * @param artifactClassLoader class loader used to load the module class
 * @param className name of the class to load
 * @param jarLocation jar location handed to the dataset framework together with the module
 * @param moduleName name under which the module is added
 * @param namespaceId namespace used to build the module and type ids
 * @param authorizingUser user name passed to {@code AuthorizationUtil.authorizeAs} for the framework calls
 * @throws IllegalArgumentException if the loaded class is neither a {@link DatasetModule} nor a {@link Dataset}
 * @throws ModuleConflictException logged at info level and rethrown unchanged
 * @throws Exception any other failure while loading, instantiating or adding the module
 */
private void loadAndDeployModule(ClassLoader artifactClassLoader, String className, final Location jarLocation, String moduleName, NamespaceId namespaceId, String authorizingUser) throws Exception {
    // note: using app class loader to load module class
    @SuppressWarnings("unchecked") Class<Dataset> clazz = (Class<Dataset>) artifactClassLoader.loadClass(className);
    String loadedName = clazz.getName();
    try {
        // note: we can deploy module or create module from Dataset class
        // note: it seems dangerous to instantiate dataset module here, but this will be fine when we move deploy into
        // isolated user's environment (e.g. separate yarn container)
        final DatasetModuleId moduleId = namespaceId.datasetModule(moduleName);

        // Reject unsupported classes up front so the remaining code only deals with the two valid kinds.
        boolean isModuleClass = DatasetModule.class.isAssignableFrom(clazz);
        if (!isModuleClass && !Dataset.class.isAssignableFrom(clazz)) {
            throw new IllegalArgumentException(String.format("Cannot use class %s to add dataset module: it must be of type DatasetModule or Dataset", loadedName));
        }

        final DatasetModule module;
        if (isModuleClass) {
            module = (DatasetModule) clazz.newInstance();
        } else {
            // System types are never redeployed from an artifact.
            if (systemDatasetFramework.hasSystemType(loadedName)) {
                return;
            }
            final DatasetTypeId typeId = namespaceId.datasetType(loadedName);
            Callable<Boolean> typeLookup = () -> datasetFramework.hasType(typeId);
            boolean typeExists = AuthorizationUtil.authorizeAs(authorizingUser, typeLookup);
            // An existing type is only replaced when unchecked upgrades are enabled.
            if (typeExists && !allowDatasetUncheckedUpgrade) {
                return;
            }
            module = new SingleTypeModule(clazz);
        }

        LOG.info("Adding module: {}", loadedName);
        Callable<Void> addModule = () -> {
            datasetFramework.addModule(moduleId, module, jarLocation);
            return null;
        };
        AuthorizationUtil.authorizeAs(authorizingUser, addModule);
    } catch (ModuleConflictException e) {
        LOG.info("Conflict while deploying module {}: {}", moduleName, e.getMessage());
        throw e;
    }
}
Also used : DatasetTypeId(io.cdap.cdap.proto.id.DatasetTypeId) Dataset(io.cdap.cdap.api.dataset.Dataset) DatasetModule(io.cdap.cdap.api.dataset.module.DatasetModule) Callable(java.util.concurrent.Callable) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) ModuleConflictException(io.cdap.cdap.data2.dataset2.ModuleConflictException) DatasetModuleId(io.cdap.cdap.proto.id.DatasetModuleId) ModuleConflictException(io.cdap.cdap.data2.dataset2.ModuleConflictException) SingleTypeModule(io.cdap.cdap.data2.dataset2.SingleTypeModule)

Example 17 with Dataset

use of io.cdap.cdap.api.dataset.Dataset in project cdap by cdapio.

the class ExternalDatasets method registerLineage.

/**
 * Registers lineage for the given reference name.
 *
 * <p>The reference dataset is created with {@code EXTERNAL_DATASET_TYPE} if it does not exist yet; a
 * conflict raised by a concurrent creation is ignored. The dataset obtained from {@code datasetSupplier} is
 * then handed to {@code invokeMethod} together with the method name {@code recordRead} or
 * {@code recordWrite}, depending on the access type. Any other access type only produces a warning.
 *
 * @param admin used to check for and create the reference dataset
 * @param referenceName reference name used for source
 * @param accessType the access type of the lineage
 * @param schema optional schema; when present it is stored as the dataset's schema property
 * @param datasetSupplier supplies the dataset instance on which the access is recorded
 * @throws DatasetManagementException thrown if there was an error in creating reference dataset
 */
public static void registerLineage(Admin admin, String referenceName, AccessType accessType, @Nullable Schema schema, Supplier<Dataset> datasetSupplier) throws DatasetManagementException {
    DatasetProperties datasetProperties = schema == null
        ? DatasetProperties.EMPTY
        : DatasetProperties.of(Collections.singletonMap(DatasetProperties.SCHEMA, schema.toString()));
    try {
        boolean alreadyThere = admin.datasetExists(referenceName);
        if (!alreadyThere) {
            admin.createDataset(referenceName, EXTERNAL_DATASET_TYPE, datasetProperties);
        }
    } catch (InstanceConflictException ex) {
        // Might happen if this is executed in parallel across multiple pipeline runs.
    }
    // we cannot instantiate ExternalDataset here - it is in CDAP data-fabric,
    // and this code (the pipeline app) cannot depend on that. Thus, use reflection
    // to invoke a method on the dataset.
    Dataset ds = datasetSupplier.get();
    Class<? extends Dataset> dsClass = ds.getClass();
    final String recorderMethod;
    switch(accessType) {
        case READ:
            recorderMethod = "recordRead";
            break;
        case WRITE:
            recorderMethod = "recordWrite";
            break;
        default:
            LOG.warn("Failed to register lineage because of unknown access type {}", accessType);
            return;
    }
    invokeMethod(referenceName, ds, dsClass, recorderMethod, accessType);
}
Also used : InstanceConflictException(io.cdap.cdap.api.dataset.InstanceConflictException) Dataset(io.cdap.cdap.api.dataset.Dataset) DatasetProperties(io.cdap.cdap.api.dataset.DatasetProperties)

Example 18 with Dataset

use of io.cdap.cdap.api.dataset.Dataset in project cdap by cdapio.

the class ExternalDatasets method makeTrackable.

/**
 * Makes the given input trackable for lineage.
 *
 * <p>An input that is not backed by an input format provider, or whose provider is itself a
 * {@link Dataset}, is returned unchanged. Otherwise an external dataset named after the input is created
 * (or, if a dataset of that name already exists, verified to be of the external dataset type) and an input
 * reading from that dataset is returned under the original alias.
 *
 * @param admin {@link Admin} used to create external dataset
 * @param input input to be tracked
 * @return an external dataset if input is an external source, otherwise the same input that is passed-in is returned
 * @throws IllegalArgumentException if a dataset with the input's name exists but is not an external dataset
 */
public static Input makeTrackable(Admin admin, Input input) {
    // Only inputs backed by an input format provider may need an external dataset.
    if (!(input instanceof Input.InputFormatProviderInput)) {
        return input;
    }
    String externalName = input.getName();
    InputFormatProvider provider = ((Input.InputFormatProviderInput) input).getInputFormatProvider();
    Map<String, String> providerConf = provider.getInputFormatConfiguration();
    // A provider that is itself a Dataset can be tracked without an external dataset.
    if (provider instanceof Dataset) {
        return input;
    }
    try {
        // Arguments of the external dataset: the input format class plus its configuration.
        Map<String, String> arguments = new HashMap<>();
        arguments.put("input.format.class", provider.getInputFormatClassName());
        arguments.putAll(providerConf);
        if (admin.datasetExists(externalName)) {
            // The name is taken: it must belong to an external dataset, not to a regular CDAP Dataset.
            if (!EXTERNAL_DATASET_TYPE.equals(admin.getDatasetType(externalName))) {
                throw new IllegalArgumentException("An external source cannot have the same name as an existing CDAP Dataset instance " + externalName);
            }
        } else {
            // Note: the dataset properties are the same as the arguments since we cannot identify them separately
            // since they are mixed up in a single configuration object (CDAP-5674)
            // Also, the properties of the external dataset created will contain runtime arguments for the same reason.
            admin.createDataset(externalName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
        }
        return Input.ofDataset(externalName, Collections.unmodifiableMap(arguments)).alias(input.getAlias());
    } catch (DatasetManagementException e) {
        throw Throwables.propagate(e);
    }
}
Also used : DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) InputFormatProvider(io.cdap.cdap.api.data.batch.InputFormatProvider) HashMap(java.util.HashMap) Dataset(io.cdap.cdap.api.dataset.Dataset)

Example 19 with Dataset

use of io.cdap.cdap.api.dataset.Dataset in project cdap by cdapio.

the class ExternalDatasets method makeTrackable.

/**
 * Makes the given output trackable for lineage.
 *
 * <p>An output that is not backed by an output format provider, or whose provider is itself a
 * {@link Dataset}, is returned unchanged. Otherwise an external dataset named after the output is created
 * (or, if a dataset of that name already exists, verified to be of the external dataset type) and an output
 * writing to that dataset is returned under the original alias.
 *
 * @param admin {@link Admin} used to create external dataset
 * @param output output to be tracked
 * @return an external dataset if output is an external sink, otherwise the same output is returned
 * @throws IllegalArgumentException if a dataset with the output's name exists but is not an external dataset
 */
public static Output makeTrackable(Admin admin, Output output) {
    // Only outputs backed by an output format provider may need an external dataset.
    if (!(output instanceof Output.OutputFormatProviderOutput)) {
        return output;
    }
    String externalName = output.getName();
    OutputFormatProvider provider = ((Output.OutputFormatProviderOutput) output).getOutputFormatProvider();
    Map<String, String> providerConf = provider.getOutputFormatConfiguration();
    // A provider that is itself a Dataset can be tracked without an external dataset.
    if (provider instanceof Dataset) {
        return output;
    }
    try {
        // Arguments of the external dataset: the output format class plus its configuration.
        Map<String, String> arguments = new HashMap<>();
        arguments.put("output.format.class", provider.getOutputFormatClassName());
        arguments.putAll(providerConf);
        if (admin.datasetExists(externalName)) {
            // The name is taken: it must belong to an external dataset, not to a regular CDAP Dataset.
            if (!EXTERNAL_DATASET_TYPE.equals(admin.getDatasetType(externalName))) {
                throw new IllegalArgumentException("An external sink cannot have the same name as an existing CDAP Dataset instance " + externalName);
            }
        } else {
            // Note: the dataset properties are the same as the arguments since we cannot identify them separately
            // since they are mixed up in a single configuration object (CDAP-5674)
            // Also, the properties of the external dataset created will contain runtime arguments for the same reason.
            admin.createDataset(externalName, EXTERNAL_DATASET_TYPE, DatasetProperties.of(arguments));
        }
        return Output.ofDataset(externalName, Collections.unmodifiableMap(arguments)).alias(output.getAlias());
    } catch (DatasetManagementException e) {
        throw Throwables.propagate(e);
    }
}
Also used : DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) HashMap(java.util.HashMap) Dataset(io.cdap.cdap.api.dataset.Dataset) OutputFormatProvider(io.cdap.cdap.api.data.batch.OutputFormatProvider)

Example 20 with Dataset

use of io.cdap.cdap.api.dataset.Dataset in project cdap by cdapio.

the class DatasetAdminService method computeSystemMetadata.

/**
 * Computes the system metadata of a dataset instance.
 *
 * <p>Only user datasets get system metadata; for any other dataset {@link SystemMetadata#EMPTY} is returned.
 * The dataset is instantiated via {@code ImpersonationUtils.doAs} so the metadata provider can inspect it. If
 * that instantiation fails, the failure is logged and the provider is built with a {@code null} dataset. The
 * instantiated dataset, if any, is closed before returning.
 *
 * @param datasetInstanceId id of the dataset instance
 * @param spec dataset specification used for instantiation and for the description
 * @param props dataset properties handed to the metadata provider
 * @param typeMeta type meta whose name is handed to the metadata provider
 * @param type dataset type used to instantiate the dataset
 * @param context dataset context used for instantiation
 * @param existing whether the dataset already exists; when {@code false} the current time is passed to the
 *                 provider as the creation time
 * @param ugi user group information passed to {@code ImpersonationUtils.doAs}
 * @return the derived system metadata, or {@link SystemMetadata#EMPTY} for non-user datasets
 * @throws IOException if closing the dataset fails
 */
private SystemMetadata computeSystemMetadata(DatasetId datasetInstanceId, final DatasetSpecification spec, DatasetProperties props, final DatasetTypeMeta typeMeta, final DatasetType type, final DatasetContext context, boolean existing, UserGroupInformation ugi) throws IOException {
    // add system metadata for user datasets only
    if (!DatasetsUtil.isUserDataset(datasetInstanceId)) {
        return SystemMetadata.EMPTY;
    }
    Dataset instance = null;
    try {
        try {
            instance = ImpersonationUtils.doAs(ugi, () -> type.getDataset(context, spec, DatasetDefinition.NO_ARGUMENTS));
        } catch (Exception e) {
            LOG.warn("Exception while instantiating Dataset {}", datasetInstanceId, e);
        }
        // Make sure to write whatever system metadata that can be derived
        // even if the above instantiation throws exception
        DatasetSystemMetadataProvider provider = existing
            ? new DatasetSystemMetadataProvider(datasetInstanceId, props, instance, typeMeta.getName(), spec.getDescription())
            : new DatasetSystemMetadataProvider(datasetInstanceId, props, System.currentTimeMillis(), instance, typeMeta.getName(), spec.getDescription());
        return new SystemMetadata(provider.getSystemPropertiesToAdd(), provider.getSystemTagsToAdd(), provider.getSchemaToAdd());
    } finally {
        if (instance != null) {
            instance.close();
        }
    }
}
Also used : Dataset(io.cdap.cdap.api.dataset.Dataset) DatasetSystemMetadataProvider(io.cdap.cdap.data2.metadata.system.DatasetSystemMetadataProvider) SystemMetadata(io.cdap.cdap.data2.metadata.system.SystemMetadata) IncompatibleUpdateException(io.cdap.cdap.api.dataset.IncompatibleUpdateException) AccessException(io.cdap.cdap.api.security.AccessException) IOException(java.io.IOException) BadRequestException(io.cdap.cdap.common.BadRequestException) NotFoundException(io.cdap.cdap.common.NotFoundException)

Aggregations

Dataset (io.cdap.cdap.api.dataset.Dataset)40 IOException (java.io.IOException)20 DatasetInstantiationException (io.cdap.cdap.api.data.DatasetInstantiationException)10 DatasetManagementException (io.cdap.cdap.api.dataset.DatasetManagementException)10 SystemDatasetInstantiator (io.cdap.cdap.data.dataset.SystemDatasetInstantiator)8 UnauthorizedException (io.cdap.cdap.security.spi.authorization.UnauthorizedException)8 UnsupportedTypeException (io.cdap.cdap.api.data.schema.UnsupportedTypeException)4 DatasetSpecification (io.cdap.cdap.api.dataset.DatasetSpecification)4 PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet)4 MeteredDataset (io.cdap.cdap.api.dataset.metrics.MeteredDataset)4 TopicNotFoundException (io.cdap.cdap.api.messaging.TopicNotFoundException)4 BadRequestException (io.cdap.cdap.common.BadRequestException)4 CustomDatasetApp (io.cdap.cdap.data2.dataset2.customds.CustomDatasetApp)4 CustomOperations (io.cdap.cdap.data2.dataset2.customds.CustomOperations)4 DefaultTopLevelExtendsDataset (io.cdap.cdap.data2.dataset2.customds.DefaultTopLevelExtendsDataset)4 DelegatingDataset (io.cdap.cdap.data2.dataset2.customds.DelegatingDataset)4 TopLevelDataset (io.cdap.cdap.data2.dataset2.customds.TopLevelDataset)4 TopLevelDirectDataset (io.cdap.cdap.data2.dataset2.customds.TopLevelDirectDataset)4 TopLevelExtendsDataset (io.cdap.cdap.data2.dataset2.customds.TopLevelExtendsDataset)4 ByteCodeClassLoader (io.cdap.cdap.internal.asm.ByteCodeClassLoader)4