use of io.cdap.cdap.data2.metadata.writer.MetadataPublisher in project cdap by caskdata.
the class MapReduceTaskContextProvider method createCacheLoader.
/**
* Creates a {@link CacheLoader} for the task context cache.
*/
private CacheLoader<ContextCacheKey, BasicMapReduceTaskContext> createCacheLoader(final Injector injector) {
DiscoveryServiceClient discoveryServiceClient = injector.getInstance(DiscoveryServiceClient.class);
DatasetFramework datasetFramework = injector.getInstance(DatasetFramework.class);
SecureStore secureStore = injector.getInstance(SecureStore.class);
SecureStoreManager secureStoreManager = injector.getInstance(SecureStoreManager.class);
MessagingService messagingService = injector.getInstance(MessagingService.class);
// Multiple instances of BasicMapReduceTaskContext can share the same program.
AtomicReference<Program> programRef = new AtomicReference<>();
MetadataReader metadataReader = injector.getInstance(MetadataReader.class);
MetadataPublisher metadataPublisher = injector.getInstance(MetadataPublisher.class);
FieldLineageWriter fieldLineageWriter = injector.getInstance(FieldLineageWriter.class);
RemoteClientFactory remoteClientFactory = injector.getInstance(RemoteClientFactory.class);
return new CacheLoader<ContextCacheKey, BasicMapReduceTaskContext>() {
@Override
public BasicMapReduceTaskContext load(ContextCacheKey key) throws Exception {
TaskAttemptID taskAttemptId = key.getTaskAttemptID();
// taskAttemptId could be null if used from a org.apache.hadoop.mapreduce.Partitioner or
// from a org.apache.hadoop.io.RawComparator, in which case we can get the JobId from the conf. Note that the
// JobId isn't in the conf for the OutputCommitter#setupJob method, in which case we use the taskAttemptId
Path txFile = MainOutputCommitter.getTxFile(key.getConfiguration(), taskAttemptId != null ? taskAttemptId.getJobID() : null);
FileSystem fs = txFile.getFileSystem(key.getConfiguration());
Transaction transaction = null;
if (fs.exists(txFile)) {
try (FSDataInputStream txFileInputStream = fs.open(txFile)) {
transaction = new TransactionCodec().decode(ByteStreams.toByteArray(txFileInputStream));
}
}
MapReduceContextConfig contextConfig = new MapReduceContextConfig(key.getConfiguration());
MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(key.getConfiguration());
Program program = programRef.get();
if (program == null) {
// Creation of program is relatively cheap, so just create and do compare and set.
programRef.compareAndSet(null, createProgram(contextConfig, classLoader.getProgramClassLoader()));
program = programRef.get();
}
WorkflowProgramInfo workflowInfo = contextConfig.getWorkflowProgramInfo();
DatasetFramework programDatasetFramework = workflowInfo == null ? datasetFramework : NameMappedDatasetFramework.createFromWorkflowProgramInfo(datasetFramework, workflowInfo, program.getApplicationSpecification());
// Setup dataset framework context, if required
if (programDatasetFramework instanceof ProgramContextAware) {
ProgramRunId programRunId = program.getId().run(ProgramRunners.getRunId(contextConfig.getProgramOptions()));
((ProgramContextAware) programDatasetFramework).setContext(new BasicProgramContext(programRunId));
}
MapReduceSpecification spec = program.getApplicationSpecification().getMapReduce().get(program.getName());
MetricsCollectionService metricsCollectionService = null;
MapReduceMetrics.TaskType taskType = null;
String taskId = null;
ProgramOptions options = contextConfig.getProgramOptions();
// from a org.apache.hadoop.io.RawComparator
if (taskAttemptId != null) {
taskId = taskAttemptId.getTaskID().toString();
if (MapReduceMetrics.TaskType.hasType(taskAttemptId.getTaskType())) {
taskType = MapReduceMetrics.TaskType.from(taskAttemptId.getTaskType());
// if this is not for a mapper or a reducer, we don't need the metrics collection service
metricsCollectionService = injector.getInstance(MetricsCollectionService.class);
options = new SimpleProgramOptions(options.getProgramId(), options.getArguments(), new BasicArguments(RuntimeArguments.extractScope("task", taskType.toString().toLowerCase(), contextConfig.getProgramOptions().getUserArguments().asMap())), options.isDebug());
}
}
CConfiguration cConf = injector.getInstance(CConfiguration.class);
TransactionSystemClient txClient = injector.getInstance(TransactionSystemClient.class);
NamespaceQueryAdmin namespaceQueryAdmin = injector.getInstance(NamespaceQueryAdmin.class);
return new BasicMapReduceTaskContext(program, options, cConf, taskType, taskId, spec, workflowInfo, discoveryServiceClient, metricsCollectionService, txClient, transaction, programDatasetFramework, classLoader.getPluginInstantiator(), contextConfig.getLocalizedResources(), secureStore, secureStoreManager, accessEnforcer, authenticationContext, messagingService, mapReduceClassLoader, metadataReader, metadataPublisher, namespaceQueryAdmin, fieldLineageWriter, remoteClientFactory);
}
};
}
use of io.cdap.cdap.data2.metadata.writer.MetadataPublisher in project cdap by caskdata.
the class SparkProgramRunner method run.
@Override
public ProgramController run(Program program, ProgramOptions options) {
LOG.trace("Starting Spark program {} with SparkProgramRunner of ClassLoader {}", program.getId(), getClass().getClassLoader());
// Get the RunId first. It is used for the creation of the ClassLoader closing thread.
Arguments arguments = options.getArguments();
RunId runId = ProgramRunners.getRunId(options);
Deque<Closeable> closeables = new LinkedList<>();
try {
// Extract and verify parameters
ApplicationSpecification appSpec = program.getApplicationSpecification();
Preconditions.checkNotNull(appSpec, "Missing application specification.");
ProgramType processorType = program.getType();
Preconditions.checkNotNull(processorType, "Missing processor type.");
Preconditions.checkArgument(processorType == ProgramType.SPARK, "Only Spark process type is supported.");
SparkSpecification spec = appSpec.getSpark().get(program.getName());
Preconditions.checkNotNull(spec, "Missing SparkSpecification for %s", program.getName());
String host = options.getArguments().getOption(ProgramOptionConstants.HOST);
Preconditions.checkArgument(host != null, "No hostname is provided");
// Get the WorkflowProgramInfo if it is started by Workflow
WorkflowProgramInfo workflowInfo = WorkflowProgramInfo.create(arguments);
DatasetFramework programDatasetFramework = workflowInfo == null ? datasetFramework : NameMappedDatasetFramework.createFromWorkflowProgramInfo(datasetFramework, workflowInfo, appSpec);
// Setup dataset framework context, if required
if (programDatasetFramework instanceof ProgramContextAware) {
ProgramId programId = program.getId();
((ProgramContextAware) programDatasetFramework).setContext(new BasicProgramContext(programId.run(runId)));
}
PluginInstantiator pluginInstantiator = createPluginInstantiator(options, program.getClassLoader());
if (pluginInstantiator != null) {
closeables.addFirst(pluginInstantiator);
}
SparkRuntimeContext runtimeContext = new SparkRuntimeContext(new Configuration(hConf), program, options, cConf, host, txClient, programDatasetFramework, metricsCollectionService, workflowInfo, pluginInstantiator, secureStore, secureStoreManager, accessEnforcer, authenticationContext, messagingService, serviceAnnouncer, pluginFinder, locationFactory, metadataReader, metadataPublisher, namespaceQueryAdmin, fieldLineageWriter, remoteClientFactory, () -> {
});
closeables.addFirst(runtimeContext);
Spark spark;
try {
spark = new InstantiatorFactory(false).get(TypeToken.of(program.<Spark>getMainClass())).create();
} catch (Exception e) {
LOG.error("Failed to instantiate Spark class for {}", spec.getClassName(), e);
throw Throwables.propagate(e);
}
boolean isLocal = SparkRuntimeContextConfig.isLocal(options);
SparkSubmitter submitter;
// If MasterEnvironment is not available, use non-master env spark submitters
MasterEnvironment masterEnv = MasterEnvironments.getMasterEnvironment();
if (masterEnv != null && cConf.getBoolean(Constants.Environment.PROGRAM_SUBMISSION_MASTER_ENV_ENABLED, true)) {
submitter = new MasterEnvironmentSparkSubmitter(cConf, locationFactory, host, runtimeContext, masterEnv);
} else {
submitter = isLocal ? new LocalSparkSubmitter() : new DistributedSparkSubmitter(hConf, locationFactory, host, runtimeContext, options.getArguments().getOption(Constants.AppFabric.APP_SCHEDULER_QUEUE));
}
Service sparkRuntimeService = new SparkRuntimeService(cConf, spark, getPluginArchive(options), runtimeContext, submitter, locationFactory, isLocal, fieldLineageWriter, masterEnv);
sparkRuntimeService.addListener(createRuntimeServiceListener(closeables), Threads.SAME_THREAD_EXECUTOR);
ProgramController controller = new SparkProgramController(sparkRuntimeService, runtimeContext);
LOG.debug("Starting Spark Job. Context: {}", runtimeContext);
if (isLocal || UserGroupInformation.isSecurityEnabled()) {
sparkRuntimeService.start();
} else {
ProgramRunners.startAsUser(cConf.get(Constants.CFG_HDFS_USER), sparkRuntimeService);
}
return controller;
} catch (Throwable t) {
closeAllQuietly(closeables);
throw Throwables.propagate(t);
}
}
use of io.cdap.cdap.data2.metadata.writer.MetadataPublisher in project cdap by caskdata.
the class MapReduceProgramRunner method run.
@Override
public ProgramController run(final Program program, ProgramOptions options) {
// Extract and verify parameters
ApplicationSpecification appSpec = program.getApplicationSpecification();
Preconditions.checkNotNull(appSpec, "Missing application specification.");
ProgramType processorType = program.getType();
Preconditions.checkNotNull(processorType, "Missing processor type.");
Preconditions.checkArgument(processorType == ProgramType.MAPREDUCE, "Only MAPREDUCE process type is supported.");
MapReduceSpecification spec = appSpec.getMapReduce().get(program.getName());
Preconditions.checkNotNull(spec, "Missing MapReduceSpecification for %s", program.getName());
Arguments arguments = options.getArguments();
RunId runId = ProgramRunners.getRunId(options);
WorkflowProgramInfo workflowInfo = WorkflowProgramInfo.create(arguments);
DatasetFramework programDatasetFramework = workflowInfo == null ? datasetFramework : NameMappedDatasetFramework.createFromWorkflowProgramInfo(datasetFramework, workflowInfo, appSpec);
// Setup dataset framework context, if required
if (programDatasetFramework instanceof ProgramContextAware) {
ProgramId programId = program.getId();
((ProgramContextAware) programDatasetFramework).setContext(new BasicProgramContext(programId.run(runId)));
}
MapReduce mapReduce;
try {
mapReduce = new InstantiatorFactory(false).get(TypeToken.of(program.<MapReduce>getMainClass())).create();
} catch (Exception e) {
LOG.error("Failed to instantiate MapReduce class for {}", spec.getClassName(), e);
throw Throwables.propagate(e);
}
// List of all Closeable resources that needs to be cleanup
List<Closeable> closeables = new ArrayList<>();
try {
PluginInstantiator pluginInstantiator = createPluginInstantiator(options, program.getClassLoader());
if (pluginInstantiator != null) {
closeables.add(pluginInstantiator);
}
final BasicMapReduceContext context = new BasicMapReduceContext(program, options, cConf, spec, workflowInfo, discoveryServiceClient, metricsCollectionService, txSystemClient, programDatasetFramework, getPluginArchive(options), pluginInstantiator, secureStore, secureStoreManager, messagingService, metadataReader, metadataPublisher, namespaceQueryAdmin, fieldLineageWriter, remoteClientFactory);
closeables.add(context);
Reflections.visit(mapReduce, mapReduce.getClass(), new PropertyFieldSetter(context.getSpecification().getProperties()), new MetricsFieldSetter(context.getMetrics()), new DataSetFieldSetter(context));
// note: this sets logging context on the thread level
LoggingContextAccessor.setLoggingContext(context.getLoggingContext());
// Set the job queue to hConf if it is provided
Configuration hConf = new Configuration(this.hConf);
String schedulerQueue = options.getArguments().getOption(Constants.AppFabric.APP_SCHEDULER_QUEUE);
if (schedulerQueue != null && !schedulerQueue.isEmpty()) {
hConf.set(JobContext.QUEUE_NAME, schedulerQueue);
}
ClusterMode clusterMode = ProgramRunners.getClusterMode(options);
Service mapReduceRuntimeService = new MapReduceRuntimeService(injector, cConf, hConf, mapReduce, spec, context, program.getJarLocation(), locationFactory, clusterMode, fieldLineageWriter);
mapReduceRuntimeService.addListener(createRuntimeServiceListener(closeables), Threads.SAME_THREAD_EXECUTOR);
ProgramController controller = new MapReduceProgramController(mapReduceRuntimeService, context);
LOG.debug("Starting MapReduce Job: {}", context);
// be running the job, but the data directory will be owned by cdap.
if (MapReduceTaskContextProvider.isLocal(hConf) || UserGroupInformation.isSecurityEnabled()) {
mapReduceRuntimeService.start();
} else {
ProgramRunners.startAsUser(cConf.get(Constants.CFG_HDFS_USER), mapReduceRuntimeService);
}
return controller;
} catch (Exception e) {
closeAllQuietly(closeables);
throw Throwables.propagate(e);
}
}
use of io.cdap.cdap.data2.metadata.writer.MetadataPublisher in project cdap by caskdata.
the class MetadataSubscriberServiceTest method testMetadata.
@Test
public void testMetadata() throws InterruptedException, TimeoutException, ExecutionException, IOException {
ProgramRunId workflowRunId = workflow1.run(RunIds.generate());
MetadataEntity entity = MetadataEntity.ofDataset("myns", "myds");
// Try to read, should have nothing
MetadataStorage metadataStorage = getInjector().getInstance(MetadataStorage.class);
Metadata meta = metadataStorage.read(new Read(entity, MetadataScope.USER));
Assert.assertTrue(meta.getProperties().isEmpty());
Assert.assertTrue(meta.getTags().isEmpty());
MetadataPublisher metadataPublisher = getInjector().getInstance(MessagingMetadataPublisher.class);
final String descriptionKey = MetadataConstants.DESCRIPTION_KEY;
final String creationTimeKey = MetadataConstants.CREATION_TIME_KEY;
// publish a create event
Map<String, String> props = ImmutableMap.of("x", "y", descriptionKey, "desc1", creationTimeKey, "123456");
Set<String> tags = ImmutableSet.of("sometag");
metadataPublisher.publish(NamespaceId.SYSTEM, new MetadataOperation.Create(entity, props, tags));
// wait until meta data is written
waitForSystemMetadata(entity, metadataStorage, 3, 1);
// validate correctness of meta data after create
meta = metadataStorage.read(new Read(entity, MetadataScope.SYSTEM));
Assert.assertEquals(props, meta.getProperties(MetadataScope.SYSTEM));
Assert.assertEquals(tags, meta.getTags(MetadataScope.SYSTEM));
// publish another create event with different create time, no description, different tags
Set<String> tags2 = ImmutableSet.of("another", "two");
metadataPublisher.publish(workflowRunId, new MetadataOperation.Create(entity, ImmutableMap.of(creationTimeKey, "9876543", "new", "prop"), tags2));
// wait until meta data is written
waitForSystemMetadata(entity, metadataStorage, 3, 2);
// validate correctness of meta data: creation time and description unchanged, other new property there
meta = metadataStorage.read(new Read(entity, MetadataScope.SYSTEM));
Assert.assertEquals(ImmutableMap.of(creationTimeKey, "123456", descriptionKey, "desc1", "new", "prop"), meta.getProperties(MetadataScope.SYSTEM));
Assert.assertEquals(tags2, meta.getTags(MetadataScope.SYSTEM));
// publish another create event without create time, different description, no tags
metadataPublisher.publish(workflowRunId, new MetadataOperation.Create(entity, ImmutableMap.of(descriptionKey, "some"), Collections.emptySet()));
// wait until meta data is written
waitForSystemMetadata(entity, metadataStorage, 2, 0);
// validate correctness of meta data: same creation time, updated description and other props and tags
meta = metadataStorage.read(new Read(entity, MetadataScope.SYSTEM));
Assert.assertEquals(ImmutableMap.of(creationTimeKey, "123456", descriptionKey, "some"), meta.getProperties(MetadataScope.SYSTEM));
Assert.assertEquals(Collections.emptySet(), meta.getTags(MetadataScope.SYSTEM));
// publish metadata put
Map<String, String> propertiesToAdd = ImmutableMap.of("a", "x", "b", "z");
Set<String> tagsToAdd = ImmutableSet.of("t1", "t2");
metadataPublisher.publish(workflowRunId, new MetadataOperation.Put(entity, propertiesToAdd, tagsToAdd));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 2, 2);
// validate correctness of meta data written
meta = metadataStorage.read(new Read(entity, MetadataScope.USER));
Assert.assertEquals(propertiesToAdd, meta.getProperties(MetadataScope.USER));
Assert.assertEquals(tagsToAdd, meta.getTags(MetadataScope.USER));
// publish metadata delete
metadataPublisher.publish(workflowRunId, new MetadataOperation.Delete(entity, Collections.singleton("a"), ImmutableSet.of("t1", "t3")));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 1, 1);
// validate correctness of meta data after delete
meta = metadataStorage.read(new Read(entity, MetadataScope.USER));
Assert.assertEquals(ImmutableMap.of("b", "z"), meta.getProperties(MetadataScope.USER));
Assert.assertEquals(ImmutableSet.of("t2"), meta.getTags(MetadataScope.USER));
// publish metadata put properties
metadataPublisher.publish(workflowRunId, new MetadataOperation.Put(entity, propertiesToAdd, Collections.emptySet()));
// wait until meta data is written
// one of the property key already exist so for that value will be just overwritten hence size is 2
waitForMetadata(entity, metadataStorage, 2, 1);
// publish metadata put tags
metadataPublisher.publish(workflowRunId, new MetadataOperation.Put(entity, Collections.emptyMap(), tagsToAdd));
// wait until meta data is written
// one of the tags already exists hence size is 2
waitForMetadata(entity, metadataStorage, 2, 2);
// publish delete all properties
metadataPublisher.publish(workflowRunId, new MetadataOperation.DeleteAllProperties(entity));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 0, 2);
// publish delete all tags
metadataPublisher.publish(workflowRunId, new MetadataOperation.DeleteAllTags(entity));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 0, 0);
// publish metadata put tags
metadataPublisher.publish(workflowRunId, new MetadataOperation.Put(entity, propertiesToAdd, tagsToAdd));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 2, 2);
// publish delete all
metadataPublisher.publish(workflowRunId, new MetadataOperation.DeleteAll(entity));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 0, 0);
// publish metadata put tags
metadataPublisher.publish(workflowRunId, new MetadataOperation.Put(entity, propertiesToAdd, tagsToAdd));
// wait until meta data is written
waitForMetadata(entity, metadataStorage, 2, 2);
// publish drop entity
metadataPublisher.publish(workflowRunId, new MetadataOperation.Drop(entity));
// wait until meta data is deleted
waitForSystemMetadata(entity, metadataStorage, 0, 0);
waitForMetadata(entity, metadataStorage, 0, 0);
}
Aggregations