Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.
In the class TestMRInputHelpers, the method testInputSplitLocalResourceCreation:
@Test(timeout = 5000)
public void testInputSplitLocalResourceCreation() throws Exception {
  DataSourceDescriptor dataSource = generateDataSourceDescriptorMapRed(oldSplitsDir);
  Map<String, LocalResource> localResources = dataSource.getAdditionalLocalFiles();
  Assert.assertEquals(2, localResources.size());
  Assert.assertTrue(localResources.containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
  Assert.assertTrue(localResources.containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));
}
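The two resources asserted here, job.split and job.splitmetainfo, exist only when input splits are generated on the client rather than in the AM; the DataSourceDescriptor reports them through getAdditionalLocalFiles() so Tez can localize them into every task container. A minimal sketch of how such a descriptor is typically obtained, assuming the test helper wraps the legacy split-generation path in MRInputHelpers (the splits directory is a placeholder):

Path splitsDir = new Path("/tmp/splits");
// Generates MapReduce-style splits under splitsDir and returns a descriptor
// whose additional local files point at job.split and job.splitmetainfo.
DataSourceDescriptor ds =
    MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, splitsDir, true);
Map<String, LocalResource> splitFiles = ds.getAdditionalLocalFiles();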
Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.
In the class TestMRInput, the method testAttributesInJobConf:
@Test(timeout = 5000)
public void testAttributesInJobConf() throws Exception {
  InputContext inputContext = mock(InputContext.class);
  doReturn(TEST_ATTRIBUTES_DAG_INDEX).when(inputContext).getDagIdentifier();
  doReturn(TEST_ATTRIBUTES_VERTEX_INDEX).when(inputContext).getTaskVertexIndex();
  doReturn(TEST_ATTRIBUTES_TASK_INDEX).when(inputContext).getTaskIndex();
  doReturn(TEST_ATTRIBUTES_TASK_ATTEMPT_INDEX).when(inputContext).getTaskAttemptNumber();
  doReturn(TEST_ATTRIBUTES_INPUT_INDEX).when(inputContext).getInputIndex();
  doReturn(TEST_ATTRIBUTES_DAG_ATTEMPT_NUMBER).when(inputContext).getDAGAttemptNumber();
  doReturn(TEST_ATTRIBUTES_DAG_NAME).when(inputContext).getDAGName();
  doReturn(TEST_ATTRIBUTES_VERTEX_NAME).when(inputContext).getTaskVertexName();
  doReturn(TEST_ATTRIBUTES_INPUT_NAME).when(inputContext).getSourceVertexName();
  doReturn(TEST_ATTRIBUTES_APPLICATION_ID).when(inputContext).getApplicationId();
  doReturn(TEST_ATTRIBUTES_UNIQUE_IDENTIFIER).when(inputContext).getUniqueIdentifier();

  DataSourceDescriptor dsd = MRInput.createConfigBuilder(new Configuration(false),
      TestInputFormat.class).groupSplits(false).build();
  doReturn(dsd.getInputDescriptor().getUserPayload()).when(inputContext).getUserPayload();
  doReturn(new TezCounters()).when(inputContext).getCounters();

  MRInput mrInput = new MRInput(inputContext, 1);
  mrInput.initialize();

  MRRuntimeProtos.MRSplitProto splitProto = MRRuntimeProtos.MRSplitProto.newBuilder()
      .setSplitClassName(TestInputSplit.class.getName()).build();
  InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
      0, splitProto.toByteString().asReadOnlyByteBuffer());
  List<Event> events = new LinkedList<>();
  events.add(diEvent);
  mrInput.handleEvents(events);

  TezCounter counter = mrInput.getContext().getCounters()
      .findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES);
  assertEquals(TestInputSplit.length, counter.getValue());
  assertTrue(TestInputFormat.invoked.get());
}
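Once initialize() has run and the InputDataInformationEvent has delivered the split, records are consumed through the input's KeyValueReader. A minimal consuming-side sketch (the processor plumbing around it is omitted):

// Iterate the records of the split delivered via the event above.
KeyValueReader reader = mrInput.getReader();
while (reader.next()) {
  Object key = reader.getCurrentKey();
  Object value = reader.getCurrentValue();
  // process (key, value) ...
}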
Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.
In the class TestHistoryParser, the method runWordCount:
private String runWordCount(String tokenizerProcessor, String summationProcessor,
    String dagName, boolean withTimeline) throws Exception {
  // HDFS path
  Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());
  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf,
      TextInputFormat.class, inputLoc.toString()).build();
  DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf,
      TextOutputFormat.class, outputLoc.toString()).build();
  Vertex tokenizerVertex = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(tokenizerProcessor)).addDataSource(INPUT, dataSource);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(
      Text.class.getName(), IntWritable.class.getName(),
      HashPartitioner.class.getName()).build();
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);

  // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex,
          edgeConf.createDefaultEdgeProperty()));

  TezClient tezClient = getTezClient(withTimeline);

  // Update Caller Context
  CallerContext callerContext =
      CallerContext.create("TezExamples", "Tez WordCount Example Job");
  ApplicationId appId = tezClient.getAppMasterApplicationId();
  if (appId == null) {
    appId = ApplicationId.newInstance(1001L, 1);
  }
  callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
  dag.setCallerContext(callerContext);

  DAGClient client = tezClient.submitDAG(dag);
  client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
  TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);
  if (tezClient != null) {
    tezClient.stop();
  }
  return tezDAGID.toString();
}
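getTezClient(withTimeline) is a helper of this test class; a minimal sketch of what such a helper typically does, assuming withTimeline toggles history logging to the YARN Timeline Server (the logging service class is the ATS implementation shipped with Tez; the client name is a placeholder):

private TezClient getTezClient(boolean withTimeline) throws Exception {
  TezConfiguration tezConf = new TezConfiguration(conf);
  if (withTimeline) {
    // route Tez history events to the YARN Timeline Server
    tezConf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
    tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS,
        "org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService");
  }
  TezClient tezClient = TezClient.create("WordCountTest", tezConf);
  tezClient.start();
  return tezClient;
}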
Use of org.apache.tez.dag.api.DataSourceDescriptor in project hive by apache.
In the class DagUtils, the method createVertexFromMapWork:
/*
* Helper function to create Vertex from MapWork.
*/
private Vertex createVertexFromMapWork(JobConf conf, MapWork mapWork, Path mrScratchDir,
    VertexType vertexType) throws Exception {
  // set up the operator plan
  Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
  // create the directories FileSinkOperators need
  Utilities.createTmpDirs(conf, mapWork);
  // finally create the vertex
  Vertex map = null;
  // use tez to combine splits
  boolean groupSplitsInInputInitializer;
  DataSourceDescriptor dataSource;
  int numTasks = -1;
  @SuppressWarnings("rawtypes")
  Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
  boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
  LOG.info("Vertex has custom input? " + vertexHasCustomInput);
  if (vertexHasCustomInput) {
    groupSplitsInInputInitializer = false;
    // grouping happens in the execution phase; it will be enabled in the CustomVertex.
    if (inputFormatClass != BucketizedHiveInputFormat.class
        && inputFormatClass != HiveInputFormat.class) {
      // As of now only these two formats are supported.
      inputFormatClass = HiveInputFormat.class;
    }
    conf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
    // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
    // this plug-in to avoid getting a serialized event at run-time.
    conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
  } else {
    // is HiveInputFormat
    if (inputFormatClass == HiveInputFormat.class) {
      groupSplitsInInputInitializer = true;
    } else {
      groupSplitsInInputInitializer = false;
    }
  }
  if (mapWork instanceof MergeFileWork) {
    Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
    // prepare the tmp output directory. The output tmp directory should
    // exist before jobClose (before renaming after job completion)
    Path tempOutPath = Utilities.toTempPath(outputPath);
    try {
      FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
      if (!tmpOutFS.exists(tempOutPath)) {
        tmpOutFS.mkdirs(tempOutPath);
      }
    } catch (IOException e) {
      throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
    }
  }
  // remember mapping of plan to input
  conf.set(Utilities.INPUT_NAME, mapWork.getName());
  if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
    // set up the operator plan. (before setting up splits on the AM)
    Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    // if we are generating the splits in the AM, we just need to set
    // the correct plugin.
    if (groupSplitsInInputInitializer) {
      // Not setting a payload, since the MRInput payload is the same and can be accessed.
      InputInitializerDescriptor descriptor =
          InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
      dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass)
          .groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
    } else {
      // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
      if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
        // SMB Join.
        dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass)
            .groupSplits(false).build();
      } else {
        dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass)
            .groupSplits(false).build();
      }
    }
  } else {
    // Set up client-side split generation.

    // we need to set this, because with HS2 and client side split
    // generation we end up not finding the map work. This is
    // because of thread local madness (tez split generation is
    // multi-threaded - HS2 plan cache uses thread locals). Setting
    // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code
    // to use the conf instead of the map work.
    conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
    conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT,
        mapWork.getUseVectorizedInputFileFormat());
    InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
    InputInitializerDescriptor descriptor =
        InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
    InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName())
        .setUserPayload(UserPayload.create(
            MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                .setConfigurationBytes(TezUtils.createByteStringFromConf(conf))
                .setSplits(inputSplitInfo.getSplitsProto())
                .build().toByteString().asReadOnlyByteBuffer()));
    dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
    numTasks = inputSplitInfo.getNumTasks();
    // set up the operator plan. (after generating splits - that changes configs)
    Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
  }
  UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
  String procClassName = MapTezProcessor.class.getName();
  if (mapWork instanceof MergeFileWork) {
    procClassName = MergeFileTezProcessor.class.getName();
  }
  map = Vertex.create(mapWork.getName(),
      ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf),
      numTasks, getContainerResource(conf));
  map.setTaskEnvironment(getContainerEnvironment(conf, true));
  assert mapWork.getAliasToWork().keySet().size() == 1;
  // Add the actual source input
  String alias = mapWork.getAliasToWork().keySet().iterator().next();
  map.addDataSource(alias, dataSource);
  return map;
}
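createVertexFromMapWork is a private helper, so a caller inside DagUtils wires its result into the query DAG; an illustrative sketch of that calling side (the variable names here are hypothetical, not taken from DagUtils):

// Hypothetical caller: create the map vertex and register it with the DAG.
DAG dag = DAG.create("hive-query");
Vertex mapVertex = createVertexFromMapWork(jobConf, mapWork, scratchDir,
    VertexType.AUTO_INITIALIZED_EDGES);
dag.addVertex(mapVertex);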